Total coverage: 15606 (2%)of 1195162
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ #include <linux/init.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/if_arp.h> #include <linux/list.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include <rdma/rdma_netlink.h> #include <linux/kthread.h> #include "siw.h" #include "siw_verbs.h" MODULE_AUTHOR("Bernard Metzler"); MODULE_DESCRIPTION("Software iWARP Driver"); MODULE_LICENSE("Dual BSD/GPL"); /* transmit from user buffer, if possible */ const bool zcopy_tx = true; /* Restrict usage of GSO, if hardware peer iwarp is unable to process * large packets. try_gso = true lets siw try to use local GSO, * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. */ const bool try_gso; /* Attach siw also with loopback devices */ const bool loopback_enabled = true; /* We try to negotiate CRC on, if true */ const bool mpa_crc_required; /* MPA CRC on/off enforced */ const bool mpa_crc_strict; /* Control TCP_NODELAY socket option */ const bool siw_tcp_nagle; /* Select MPA version to be used during connection setup */ u_char mpa_version = MPA_REVISION_2; /* Selects MPA P2P mode (additional handshake during connection * setup, if true. */ const bool peer_to_peer; struct task_struct *siw_tx_thread[NR_CPUS]; static int siw_device_register(struct siw_device *sdev, const char *name) { struct ib_device *base_dev = &sdev->base_dev; static int dev_id = 1; int rv; sdev->vendor_part_id = dev_id++; rv = ib_register_device(base_dev, name, NULL); if (rv) { pr_warn("siw: device registration error %d\n", rv); return rv; } siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid); return 0; } static void siw_device_cleanup(struct ib_device *base_dev) { struct siw_device *sdev = to_siw_dev(base_dev); xa_destroy(&sdev->qp_xa); xa_destroy(&sdev->mem_xa); } static int siw_dev_qualified(struct net_device *netdev) { /* * Additional hardware support can be added here * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see * <linux/if_arp.h> for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; return 0; } static DEFINE_PER_CPU(atomic_t, siw_use_cnt); static struct { struct cpumask **tx_valid_cpus; int num_nodes; } siw_cpu_info; static void siw_destroy_cpulist(int number) { int i = 0; while (i < number) kfree(siw_cpu_info.tx_valid_cpus[i++]); kfree(siw_cpu_info.tx_valid_cpus); siw_cpu_info.tx_valid_cpus = NULL; } static int siw_init_cpulist(void) { int i, num_nodes = nr_node_ids; memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); siw_cpu_info.num_nodes = num_nodes; siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus) { siw_cpu_info.num_nodes = 0; return -ENOMEM; } for (i = 0; i < siw_cpu_info.num_nodes; i++) { siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!siw_cpu_info.tx_valid_cpus[i]) goto out_err; cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); } for_each_possible_cpu(i) cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); return 0; out_err: siw_cpu_info.num_nodes = 0; siw_destroy_cpulist(i); return -ENOMEM; } /* * Choose CPU with least number of active QP's from NUMA node of * TX interface. */ int siw_get_tx_cpu(struct siw_device *sdev) { const struct cpumask *tx_cpumask; int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; if (node < 0) tx_cpumask = cpu_online_mask; else tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; num_cpus = cpumask_weight(tx_cpumask); if (!num_cpus) { /* no CPU on this NUMA node */ tx_cpumask = cpu_online_mask; num_cpus = cpumask_weight(tx_cpumask); } if (!num_cpus) goto out; cpu = cpumask_first(tx_cpumask); for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; i++, cpu = cpumask_next(cpu, tx_cpumask)) { int usage; /* Skip any cores which have no TX thread */ if (!siw_tx_thread[cpu]) continue; usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); if (usage <= min_use) { tx_cpu = cpu; min_use = usage; } } siw_dbg(&sdev->base_dev, "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); out: if (tx_cpu >= 0) atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); else pr_warn("siw: no tx cpu found\n"); return tx_cpu; } void siw_put_tx_cpu(int cpu) { atomic_dec(&per_cpu(siw_use_cnt, cpu)); } static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) { struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); if (qp) { /* * siw_qp_id2obj() increments object reference count */ siw_qp_put(qp); return &qp->base_qp; } return NULL; } static const struct ib_device_ops siw_device_ops = { .owner = THIS_MODULE, .uverbs_abi_ver = SIW_ABI_VERSION, .driver_id = RDMA_DRIVER_SIW, .alloc_mr = siw_alloc_mr, .alloc_pd = siw_alloc_pd, .alloc_ucontext = siw_alloc_ucontext, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_driver = siw_device_cleanup, .dealloc_pd = siw_dealloc_pd, .dealloc_ucontext = siw_dealloc_ucontext, .dereg_mr = siw_dereg_mr, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, .get_dma_mr = siw_get_dma_mr, .get_port_immutable = siw_get_port_immutable, .iw_accept = siw_accept, .iw_add_ref = siw_qp_get_ref, .iw_connect = siw_connect, .iw_create_listen = siw_create_listen, .iw_destroy_listen = siw_destroy_listen, .iw_get_qp = siw_get_base_qp, .iw_reject = siw_reject, .iw_rem_ref = siw_qp_put_ref, .map_mr_sg = siw_map_mr_sg, .mmap = siw_mmap, .mmap_free = siw_mmap_free, .modify_qp = siw_verbs_modify_qp, .modify_srq = siw_modify_srq, .poll_cq = siw_poll_cq, .post_recv = siw_post_receive, .post_send = siw_post_send, .post_srq_recv = siw_post_srq_recv, .query_device = siw_query_device, .query_gid = siw_query_gid, .query_port = siw_query_port, .query_qp = siw_query_qp, .query_srq = siw_query_srq, .req_notify_cq = siw_req_notify_cq, .reg_user_mr = siw_reg_user_mr, INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp), INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), }; static struct siw_device *siw_device_create(struct net_device *netdev) { struct siw_device *sdev = NULL; struct ib_device *base_dev; int rv; sdev = ib_alloc_device(siw_device, base_dev); if (!sdev) return NULL; base_dev = &sdev->base_dev; if (netdev->addr_len) { memcpy(sdev->raw_gid, netdev->dev_addr, min_t(unsigned int, netdev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection mangagement requires a unique gid. */ eth_random_addr(sdev->raw_gid); } addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid); base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND); base_dev->node_type = RDMA_NODE_RNIC; memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, sizeof(SIW_NODE_DESC_COMMON)); /* * Current model (one-to-one device association): * One Softiwarp device per net_device or, equivalently, * per physical port. */ base_dev->phys_port_cnt = 1; base_dev->num_comp_vectors = num_possible_cpus(); xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); ib_set_device_ops(base_dev, &siw_device_ops); rv = ib_device_set_netdev(base_dev, netdev, 1); if (rv) goto error; memcpy(base_dev->iw_ifname, netdev->name, sizeof(base_dev->iw_ifname)); /* Disable TCP port mapping */ base_dev->iw_driver_flags = IW_F_NO_PORT_MAP; sdev->attrs.max_qp = SIW_MAX_QP; sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; sdev->attrs.max_ord = SIW_MAX_ORD_QP; sdev->attrs.max_ird = SIW_MAX_IRD_QP; sdev->attrs.max_sge = SIW_MAX_SGE; sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; sdev->attrs.max_cq = SIW_MAX_CQ; sdev->attrs.max_cqe = SIW_MAX_CQE; sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; INIT_LIST_HEAD(&sdev->cep_list); INIT_LIST_HEAD(&sdev->qp_list); atomic_set(&sdev->num_ctx, 0); atomic_set(&sdev->num_srq, 0); atomic_set(&sdev->num_qp, 0); atomic_set(&sdev->num_cq, 0); atomic_set(&sdev->num_mr, 0); atomic_set(&sdev->num_pd, 0); sdev->numa_node = dev_to_node(&netdev->dev); spin_lock_init(&sdev->lock); return sdev; error: ib_dealloc_device(base_dev); return NULL; } static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { struct net_device *netdev = netdev_notifier_info_to_dev(arg); struct ib_device *base_dev; struct siw_device *sdev; dev_dbg(&netdev->dev, "siw: event %lu\n", event); base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (!base_dev) return NOTIFY_OK; sdev = to_siw_dev(base_dev); switch (event) { case NETDEV_REGISTER: /* * Device registration now handled only by * rdma netlink commands. So it shall be impossible * to end up here with a valid siw device. */ siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); break; case NETDEV_UNREGISTER: ib_unregister_device_queued(&sdev->base_dev); break; case NETDEV_CHANGEADDR: siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); break; /* * All other events are not handled */ default: break; } ib_device_put(&sdev->base_dev); return NOTIFY_OK; } static struct notifier_block siw_netdev_nb = { .notifier_call = siw_netdev_event, }; static int siw_newlink(const char *basedev_name, struct net_device *netdev) { struct ib_device *base_dev; struct siw_device *sdev = NULL; int rv = -ENOMEM; if (!siw_dev_qualified(netdev)) return -EINVAL; base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); if (base_dev) { ib_device_put(base_dev); return -EEXIST; } sdev = siw_device_create(netdev); if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) ib_dealloc_device(&sdev->base_dev); } return rv; } static struct rdma_link_ops siw_link_ops = { .type = "siw", .newlink = siw_newlink, }; /* * siw_init_module - Initialize Softiwarp module and register with netdev * subsystem. */ static __init int siw_init_module(void) { int rv; if (SENDPAGE_THRESH < SIW_MAX_INLINE) { pr_info("siw: sendpage threshold too small: %u\n", (int)SENDPAGE_THRESH); rv = -EINVAL; goto out_error; } rv = siw_init_cpulist(); if (rv) goto out_error; rv = siw_cm_init(); if (rv) goto out_error; if (!siw_create_tx_threads()) { pr_info("siw: Could not start any TX thread\n"); rv = -ENOMEM; goto out_error; } rv = register_netdevice_notifier(&siw_netdev_nb); if (rv) goto out_error; rdma_link_register(&siw_link_ops); pr_info("SoftiWARP attached\n"); return 0; out_error: siw_stop_tx_threads(); pr_info("SoftIWARP attach failed. Error: %d\n", rv); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); return rv; } static void __exit siw_exit_module(void) { siw_stop_tx_threads(); unregister_netdevice_notifier(&siw_netdev_nb); rdma_link_unregister(&siw_link_ops); ib_unregister_driver(RDMA_DRIVER_SIW); siw_cm_exit(); siw_destroy_cpulist(siw_cpu_info.num_nodes); pr_info("SoftiWARP detached\n"); } module_init(siw_init_module); module_exit(siw_exit_module); MODULE_ALIAS_RDMA_LINK("siw");
38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 #ifndef _ASM_ARM64_VMALLOC_H #define _ASM_ARM64_VMALLOC_H #include <asm/page.h> #include <asm/pgtable.h> #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #define arch_vmap_pud_supported arch_vmap_pud_supported static inline bool arch_vmap_pud_supported(pgprot_t prot) { /* * SW table walks can't handle removal of intermediate entries. */ return pud_sect_supported() && !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } #define arch_vmap_pmd_supported arch_vmap_pmd_supported static inline bool arch_vmap_pmd_supported(pgprot_t prot) { /* See arch_vmap_pud_supported() */ return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } #define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn, unsigned int max_page_shift) { /* * If the block is at least CONT_PTE_SIZE in size, and is naturally * aligned in both virtual and physical space, then we can pte-map the * block using the PTE_CONT bit for more efficient use of the TLB. */ if (max_page_shift < CONT_PTE_SHIFT) return PAGE_SIZE; if (end - addr < CONT_PTE_SIZE) return PAGE_SIZE; if (!IS_ALIGNED(addr, CONT_PTE_SIZE)) return PAGE_SIZE; if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE)) return PAGE_SIZE; return CONT_PTE_SIZE; } #define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, pte_t *ptep) { /* * The caller handles alignment so it's sufficient just to check * PTE_CONT. */ return pte_valid_cont(__ptep_get(ptep)) ? CONT_PTE_SIZE : PAGE_SIZE; } #define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift static inline int arch_vmap_pte_supported_shift(unsigned long size) { if (size >= CONT_PTE_SIZE) return CONT_PTE_SHIFT; return PAGE_SHIFT; } #endif #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot) { return pgprot_tagged(prot); } #endif /* _ASM_ARM64_VMALLOC_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. */ #ifndef IB_VERBS_H #define IB_VERBS_H #include <linux/ethtool.h> #include <linux/types.h> #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rwsem.h> #include <linux/workqueue.h> #include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> #include <net/ipv6.h> #include <net/ip.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/refcount.h> #include <linux/if_link.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/cgroup_rdma.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/dim.h> #include <uapi/rdma/ib_user_verbs.h> #include <rdma/rdma_counter.h> #include <rdma/restrack.h> #include <rdma/signature.h> #include <uapi/rdma/rdma_user_ioctl.h> #include <uapi/rdma/ib_user_ioctl_verbs.h> #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN struct ib_umem_odp; struct ib_uqp_object; struct ib_usrq_object; struct ib_uwq_object; struct rdma_cm_id; struct ib_port; struct hw_stats_device_data; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_alert(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_crit(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_err(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_warn(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else __printf(2, 3) __cold static inline void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {} #endif #define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ ibdev_level(ibdev, fmt, ##__VA_ARGS__); \ } while (0) #define ibdev_emerg_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__) #define ibdev_alert_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__) #define ibdev_crit_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__) #define ibdev_err_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__) #define ibdev_warn_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__) #define ibdev_notice_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__) #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \ __dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \ ##__VA_ARGS__); \ } while (0) #else __printf(2, 3) __cold static inline void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {} #endif union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB, IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1, IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { struct net_device __rcu *ndev; struct ib_device *device; union ib_gid gid; enum ib_gid_type gid_type; u16 index; u32 port_num; }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP, RDMA_TRANSPORT_UNSPECIFIED, }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; else if (network_type == RDMA_NETWORK_ROCE_V1) return IB_GID_TYPE_ROCE; else return IB_GID_TYPE_IB; } static inline enum rdma_network_type rdma_gid_attr_network_type(const struct ib_gid_attr *attr) { if (attr->gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (attr->gid_type == IB_GID_TYPE_ROCE) return RDMA_NETWORK_ROCE_V1; if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid)) return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR, IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR, IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR, IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI, IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG, IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT, IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE, IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD, IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT, /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */ IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT, IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID, IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN, IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE, IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ, /* Reserved, old SEND_W_INV = 1 << 16,*/ IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW, /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM, IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC, /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS, IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A, IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B, IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM, /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */ IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM, IB_DEVICE_MANAGED_FLOW_STEERING = IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING, /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS, /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, /* Placement type attributes */ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * instead of use the local_dma_lkey flag in the ib_pd structure, * which will always contain a usable lkey. */ IBK_LOCAL_DMA_LKEY = 1 << 0, /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */ IBK_INTEGRITY_HANDOVER = 1 << 1, /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */ IBK_ON_DEMAND_PAGING = 1 << 2, /* IB_MR_TYPE_SG_GAPS is supported */ IBK_SG_GAPS_REG = 1 << 3, /* Driver supports RDMA_NLDEV_CMD_DELLINK */ IBK_ALLOW_USER_UNREG = 1 << 4, /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */ IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5, /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */ IBK_UD_TSO = 1 << 6, /* iopib will use the device ops: * get_vf_config * get_vf_guid * get_vf_stats * set_vf_guid * set_vf_link_state */ IBK_VIRTUAL_FUNCTION = 1 << 7, /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */ IBK_RDMA_NETDEV_OPA = 1 << 8, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = IB_UVERBS_ODP_SUPPORT, IB_ODP_SUPPORT_IMPLICIT = IB_UVERBS_ODP_SUPPORT_IMPLICIT, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = IB_UVERBS_ODP_SUPPORT_SEND, IB_ODP_SUPPORT_RECV = IB_UVERBS_ODP_SUPPORT_RECV, IB_ODP_SUPPORT_WRITE = IB_UVERBS_ODP_SUPPORT_WRITE, IB_ODP_SUPPORT_READ = IB_UVERBS_ODP_SUPPORT_READ, IB_ODP_SUPPORT_ATOMIC = IB_UVERBS_ODP_SUPPORT_ATOMIC, IB_ODP_SUPPORT_SRQ_RECV = IB_UVERBS_ODP_SUPPORT_SRQ_RECV, IB_ODP_SUPPORT_FLUSH = IB_UVERBS_ODP_SUPPORT_FLUSH, IB_ODP_SUPPORT_ATOMIC_WRITE = IB_UVERBS_ODP_SUPPORT_ATOMIC_WRITE, }; struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; uint32_t xrc_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. * supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_tm_cap_flags { /* Support tag matching with rendezvous offload for RC transport */ IB_TM_CAP_RNDV_RC = 1 << 0, }; struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ u32 max_num_tags; /* From enum ib_tm_cap_flags */ u32 flags; /* Max number of outstanding list operations */ u32 max_ops; /* Max number of SGE in tag matching entry */ u32 max_sge; }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; enum ib_cq_attr_mask { IB_CQ_MODERATE = 1 << 0, }; struct ib_cq_caps { u16 max_cq_moderation_count; u16 max_cq_moderation_period; }; struct ib_dm_mr_attr { u64 length; u64 offset; u32 access_flags; }; struct ib_dm_alloc_attr { u64 length; u32 alignment; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; u64 kernel_cap_flags; int max_send_sge; int max_recv_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; unsigned int max_pi_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ struct ib_tm_caps tm_caps; struct ib_cq_caps cq_caps; u64 max_dm_size; /* Max entries for sgl for optimized performance per READ */ u32 max_sgl_rd; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; enum opa_mtu { OPA_MTU_8192 = 6, OPA_MTU_10240 = 7 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) { if (mtu >= 4096) return IB_MTU_4096; else if (mtu >= 2048) return IB_MTU_2048; else if (mtu >= 1024) return IB_MTU_1024; else if (mtu >= 512) return IB_MTU_512; else return IB_MTU_256; } static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { switch (mtu) { case OPA_MTU_8192: return 8192; case OPA_MTU_10240: return 10240; default: return(ib_mtu_enum_to_int((enum ib_mtu)mtu)); } } static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) { if (mtu >= 10240) return OPA_MTU_10240; else if (mtu >= 8192) return OPA_MTU_8192; else return ((enum opa_mtu)ib_mtu_int_to_enum(mtu)); } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5 }; static inline const char *__attribute_const__ ib_port_state_to_str(enum ib_port_state state) { const char * const states[] = { [IB_PORT_NOP] = "NOP", [IB_PORT_DOWN] = "DOWN", [IB_PORT_INIT] = "INIT", [IB_PORT_ARMED] = "ARMED", [IB_PORT_ACTIVE] = "ACTIVE", [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER", }; if (state < ARRAY_SIZE(states)) return states[state]; return "UNKNOWN"; } enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, IB_PORT_PHYS_STATE_DISABLED = 3, IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4, IB_PORT_PHYS_STATE_LINK_UP = 5, IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6, IB_PORT_PHYS_STATE_PHY_TEST = 7, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64, IB_SPEED_NDR = 128, IB_SPEED_XDR = 256, }; enum ib_stat_flag { IB_STAT_FLAG_OPTIONAL = 1 << 0, }; /** * struct rdma_stat_desc * @name - The name of the counter * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL * @priv - Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; unsigned int flags; const void *priv; }; /** * struct rdma_hw_stats * @lock - Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. * @descs - Array of pointers to static descriptors used for the counters * in directory. * @is_disabled - A bitmap to indicate each counter is currently disabled * or not. * @num_counters - How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. * @value - Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { struct mutex lock; /* Protect lifespan and values[] */ unsigned long timestamp; unsigned long lifespan; const struct rdma_stat_desc *descs; unsigned long *is_disabled; int num_counters; u64 value[] __counted_by(num_counters); }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan); void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats); /* Define bits for the various functionality this port needs to be supported by * the core. */ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 #define RDMA_CORE_CAP_OPA_AH 0x00004000 #define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 #define RDMA_CORE_CAP_PROT_USNIC 0x02000000 #define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \ | RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP) #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) #define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) #define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; u32 phys_mtu; int gid_tbl_len; unsigned int ip_gids:1; /* This is the value from PortInfo CapabilityMask, defined by IBA */ u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u16 active_speed; u8 phys_state; u16 port_cap_flags2; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3), IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u32 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { const struct ib_gid_attr *sgid_attr; union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. */ u8 reserved[20]; struct iphdr roce4grh; }; }; #define IB_QPN_MASK 0xFFFFFF enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) * @IB_MR_TYPE_DM: memory region that is used for device * memory registration * @IB_MR_TYPE_USER: memory region that is used for the user-space * application * @IB_MR_TYPE_DMA: memory region that is used for DMA operations * without address translations (VA=PA) * @IB_MR_TYPE_INTEGRITY: memory region that is used for * data integrity operations */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SG_GAPS, IB_MR_TYPE_DM, IB_MR_TYPE_USER, IB_MR_TYPE_DMA, IB_MR_TYPE_INTEGRITY, }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct rdma_ah_init_attr { struct rdma_ah_attr *ah_attr; u32 flags; struct net_device *xmit_slave; }; enum rdma_ah_attr_type { RDMA_AH_ATTR_TYPE_UNDEFINED, RDMA_AH_ATTR_TYPE_IB, RDMA_AH_ATTR_TYPE_ROCE, RDMA_AH_ATTR_TYPE_OPA, }; struct ib_ah_attr { u16 dlid; u8 src_path_bits; }; struct roce_ah_attr { u8 dmac[ETH_ALEN]; }; struct opa_ah_attr { u32 dlid; u8 src_path_bits; bool make_grd; }; struct rdma_ah_attr { struct ib_global_route grh; u8 sl; u8 static_rate; u32 port_num; u8 ah_flags; enum rdma_ah_attr_type type; union { struct ib_ah_attr ib; struct roce_ah_attr roce; struct opa_ah_attr opa; }; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND = IB_UVERBS_WC_SEND, IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE, IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ, IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP, IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD, IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; u32 slid; int wc_flags; u16 pkey_index; u8 sl; u8 dlid_path_bits; u32 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { return srq_type == IB_SRQT_XRC || srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; } xrc; struct { u32 max_num_tags; } tag_matching; }; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of needed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC = IB_UVERBS_QPT_RC, IB_QPT_UC = IB_UVERBS_QPT_UC, IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, IB_QP_CREATE_SCATTER_FCS = IB_UVERBS_QP_CREATE_SCATTER_FCS, IB_QP_CREATE_CVLAN_STRIPPING = IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { /* This callback occurs in workqueue context */ void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; u32 create_flags; /* * Only needed for special QP types, or when using the RW API. */ u32 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; u32 source_qpn; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), IB_QP_RATE_LIMIT = (1<<25), IB_QP_ATTR_STANDARD_BITS = GENMASK(20, 0), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct rdma_ah_attr ah_attr; struct rdma_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u32 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u32 alt_port_num; u8 alt_timeout; u32 rate_limit; struct net_device *xmit_slave; }; enum ib_wr_opcode { /* These are shared with userspace */ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND = IB_UVERBS_WR_SEND, IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW, IB_WR_LSO = IB_UVERBS_WR_TSO, IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, IB_WR_MASKED_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u32 port_num; /* valid for DR SMPs on switch only */ }; static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE, IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ, IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC, IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND, IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED, IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_umem; enum rdma_remove_reason { /* * Userspace requested uobject deletion or initial try * to remove uobject via cleanup. Call could fail */ RDMA_REMOVE_DESTROY, /* Context deletion. This call should delete the actual object itself */ RDMA_REMOVE_CLOSE, /* Driver is being hot-unplugged. This call should delete the actual object itself */ RDMA_REMOVE_DRIVER_REMOVE, /* uobj is being cleaned-up before being committed */ RDMA_REMOVE_ABORT, /* The driver failed to destroy the uobject and is being disconnected */ RDMA_REMOVE_DRIVER_FAILURE, }; struct ib_rdmacg_object { #ifdef CONFIG_CGROUP_RDMA struct rdma_cgroup *cg; /* owner rdma cgroup */ #endif }; struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; u64 enabled_caps; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; struct xarray mmap_xa; }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ /* ufile & ucontext owning this object */ struct ib_uverbs_file *ufile; /* FIXME, save memory: ufile->context == context */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; atomic_t usecnt; /* protects exclusive access */ struct rcu_head rcu; /* kfree_rcu() overhead */ const struct uverbs_api_object *uapi_object; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; struct rdma_restrack_entry res; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rw_semaphore tgt_qps_rwsem; struct xarray tgt_qps; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; const struct ib_gid_attr *sgid_attr; enum rdma_ah_attr_type type; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, IB_POLL_DIRECT, /* caller context, no hw completions */ }; struct ib_cq { struct ib_device *device; struct ib_ucq_object *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; unsigned int cqe_used; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct ib_wc *wc; struct list_head pool_entry; union { struct irq_poll iop; struct work_struct work; }; struct workqueue_struct *comp_wq; struct dim *dim; /* updated only by trace points */ ktime_t timestamp; u8 interrupt:1; u8 shared:1; unsigned int comp_vector; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_usrq_object *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; u32 srq_num; } xrc; }; } ext; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; enum ib_raw_packet_caps { /* * Strip cvlan from incoming packet and report it in the matching work * completion is supported. */ IB_RAW_PACKET_CAP_CVLAN_STRIPPING = IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING, /* * Scatter FCS field of an incoming packet to host memory is supported. */ IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS, /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM, /* * When a packet is received for an RQ with no receive WQEs, the * packet processing is delayed. */ IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP, }; enum ib_wq_type { IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uwq_object *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, IB_WQ_FLAGS_PCI_WRITE_END_PADDING = IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); u32 create_flags; /* Use enum ib_wq_flags */ }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, IB_WQ_FLAGS = 1 << 2, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; u32 flags; /* Use enum ib_wq_flags */ u32 flags_mask; /* Use enum ib_wq_flags */ }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; enum port_pkey_state { IB_PORT_PKEY_NOT_VALID = 0, IB_PORT_PKEY_VALID = 1, IB_PORT_PKEY_LISTED = 2, }; struct ib_qp_security; struct ib_port_pkey { enum port_pkey_state state; u16 pkey_index; u32 port_num; struct list_head qp_list; struct list_head to_error_list; struct ib_qp_security *sec; }; struct ib_ports_pkeys { struct ib_port_pkey main; struct ib_port_pkey alt; }; struct ib_qp_security { struct ib_qp *qp; struct ib_device *dev; /* Hold this mutex when changing port and pkey settings. */ struct mutex mutex; struct ib_ports_pkeys *ports_pkeys; /* A list of all open shared QP handles. Required to enforce security * properly for all users of a shared QP. */ struct list_head shared_qp_list; void *security; bool destroying; atomic_t error_list_count; struct completion error_complete; int error_comps_pending; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; int mrs_used; struct list_head rdma_mrs; struct list_head sig_mrs; struct ib_srq *srq; struct completion srq_completion; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uqp_object *uobject; void (*event_handler)(struct ib_event *, void *); void (*registered_event_handler)(struct ib_event *, void *); void *qp_context; /* sgid_attrs associated with the AV's */ const struct ib_gid_attr *av_sgid_attr; const struct ib_gid_attr *alt_path_sgid_attr; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; u32 port; bool integrity_en; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; /* The counter the qp is bind to */ struct rdma_counter *counter; }; struct ib_dm { struct ib_device *device; u32 length; u32 flags; struct ib_uobject *uobject; atomic_t usecnt; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; enum ib_mr_type type; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, IB_FLOW_SPEC_ESP = 0x34, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IB_FLOW_SPEC_GRE = 0x51, IB_FLOW_SPEC_MPLS = 0x60, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, IB_FLOW_SPEC_ACTION_DROP = 0x1001, IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, IB_FLOW_SPEC_ACTION_COUNT = 0x1003, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 10 enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; struct ib_flow_spec_eth { u32 type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; }; struct ib_flow_spec_ib { u32 type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; }; struct ib_flow_spec_ipv4 { u32 type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; } __packed; struct ib_flow_spec_ipv6 { u32 type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; struct ib_flow_spec_tcp_udp { u32 type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; struct ib_flow_tunnel_filter { __be32 tunnel_id; }; /* ib_flow_spec_tunnel describes the Vxlan tunnel * the tunnel_id from val has the vni value */ struct ib_flow_spec_tunnel { u32 type; u16 size; struct ib_flow_tunnel_filter val; struct ib_flow_tunnel_filter mask; }; struct ib_flow_esp_filter { __be32 spi; __be32 seq; }; struct ib_flow_spec_esp { u32 type; u16 size; struct ib_flow_esp_filter val; struct ib_flow_esp_filter mask; }; struct ib_flow_gre_filter { __be16 c_ks_res0_ver; __be16 protocol; __be32 key; }; struct ib_flow_spec_gre { u32 type; u16 size; struct ib_flow_gre_filter val; struct ib_flow_gre_filter mask; }; struct ib_flow_mpls_filter { __be32 tag; }; struct ib_flow_spec_mpls { u32 type; u16 size; struct ib_flow_mpls_filter val; struct ib_flow_mpls_filter mask; }; struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; u32 tag_id; }; struct ib_flow_spec_action_drop { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_action_handle { enum ib_flow_spec_type type; u16 size; struct ib_flow_action *act; }; enum ib_counters_description { IB_COUNTER_PACKETS, IB_COUNTER_BYTES, }; struct ib_flow_spec_action_count { enum ib_flow_spec_type type; u16 size; struct ib_counters *counters; }; union ib_flow_spec { struct { u32 type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; struct ib_flow_spec_gre gre; struct ib_flow_spec_mpls mpls; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; struct ib_flow_spec_action_count flow_count; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u32 port; union ib_flow_spec flows[]; }; struct ib_flow { struct ib_qp *qp; struct ib_device *device; struct ib_uobject *uobject; }; enum ib_flow_action_type { IB_FLOW_ACTION_UNSPECIFIED, IB_FLOW_ACTION_ESP = 1, }; struct ib_flow_action_attrs_esp_keymats { enum ib_uverbs_flow_action_esp_keymat protocol; union { struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; } keymat; }; struct ib_flow_action_attrs_esp_replays { enum ib_uverbs_flow_action_esp_replay protocol; union { struct ib_uverbs_flow_action_esp_replay_bmp bmp; } replay; }; enum ib_flow_action_attrs_esp_flags { /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags * This is done in order to share the same flags between user-space and * kernel and spare an unnecessary translation. */ /* Kernel flags */ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, }; struct ib_flow_spec_list { struct ib_flow_spec_list *next; union ib_flow_spec spec; }; struct ib_flow_action_attrs_esp { struct ib_flow_action_attrs_esp_keymats *keymat; struct ib_flow_action_attrs_esp_replays *replay; struct ib_flow_spec_list *encap; /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. * Value of 0 is a valid value. */ u32 esn; u32 spi; u32 seq; u32 tfc_pad; /* Use enum ib_flow_action_attrs_esp_flags */ u64 flags; u64 hard_limit_pkts; }; struct ib_flow_action { struct ib_device *device; struct ib_uobject *uobject; enum ib_flow_action_type type; atomic_t usecnt; }; struct ib_mad; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; struct ib_port_cache { u64 subnet_prefix; struct ib_pkey_cache *pkey; struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; enum ib_port_state last_port_state; }; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_port_data { struct ib_device *ib_dev; struct ib_port_immutable immutable; spinlock_t pkey_list_lock; spinlock_t netdev_lock; struct list_head pkey_list; struct ib_port_cache cache; struct net_device __rcu *netdev; netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; }; /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; /** * struct rdma_netdev - rdma netdev * For cases where netstack interfacing is required. */ struct rdma_netdev { void *clnt_priv; struct ib_device *hca; u32 port_num; int mtu; /* * cleanup function must be specified. * FIXME: This is only used for OPA_VNIC and that usage should be * removed too. */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ void (*set_id)(struct net_device *netdev, int id); /* send packet */ int (*send)(struct net_device *dev, struct sk_buff *skb, struct ib_ah *address, u32 dqpn); /* multicast */ int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid, int set_qkey, u32 qkey); int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid); /* timeout */ void (*tx_timeout)(struct net_device *dev, unsigned int txqueue); }; struct rdma_netdev_alloc_params { size_t sizeof_priv; unsigned int txqs; unsigned int rxqs; void *param; int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, struct net_device *netdev, void *param); }; struct ib_odp_counters { atomic64_t faults; atomic64_t faults_handled; atomic64_t invalidations; atomic64_t invalidations_handled; atomic64_t prefetch; }; struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; /* num of objects attached */ atomic_t usecnt; }; struct ib_counters_read_attr { u64 *counters_buff; u32 ncounters; u32 flags; /* use enum ib_read_counters_flags */ }; struct uverbs_attr_bundle; struct iw_cm_id; struct iw_cm_conn_param; #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ .size_##ib_struct = \ (sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ BUILD_BUG_ON_ZERO( \ !__same_type(((struct drv_struct *)NULL)->member, \ struct ib_struct))) #define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ gfp, false)) #define rdma_zalloc_drv_obj_numa(ib_dev, ib_type) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ GFP_KERNEL, true)) #define rdma_zalloc_drv_obj(ib_dev, ib_type) \ rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL) #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct struct rdma_user_mmap_entry { struct kref ref; struct ib_ucontext *ucontext; unsigned long start_pgoff; size_t npages; bool driver_removed; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ static inline u64 rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) { return (u64)entry->start_pgoff << PAGE_SHIFT; } /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; /* * NOTE: New drivers should not make use of device_group; instead new * device parameter should be exposed via netlink command. This * mechanism exists only for existing drivers. */ const struct attribute_group *device_group; const struct attribute_group **port_groups; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*post_srq_recv)(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int (*process_mad)(struct ib_device *device, int process_mad_flags, u32 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *device, u32 port_num, struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); /** * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device has finished * NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); /** * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params * must return -EOPNOTSUPP if it doesn't support the specified type. */ struct net_device *(*alloc_rdma_netdev)( struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); /** * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); /** * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is * available at @attr. @context allows the HW vendor driver to store * extra information together with a GID entry. The HW vendor driver may * allocate memory to contain this information and store it in @context * when a new GID entry is written to. Params are consistent until the * next call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. This function is only called when * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); /** * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int (*alloc_ucontext)(struct ib_ucontext *context, struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); /** * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. * Therefore needs to be implemented by the driver in mmap_free. */ void (*mmap_free)(struct rdma_user_mmap_entry *entry); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah, u32 flags); int (*create_srq)(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata); int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); int (*advise_mr)(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge, struct uverbs_attr_bundle *attrs); /* * Kernel users should universally support relaxed ordering (RO), as * they are designed to read data only after observing the CQE and use * the DMA API correctly. * * Some drivers implicitly enable RO if platform supports it. */ int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); struct ib_flow *(*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*destroy_flow_action)(struct ib_flow_action *action); int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int (*get_vf_guid)(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid, int type); struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dm *(*alloc_dm)(struct ib_device *device, struct ib_ucontext *context, struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_counters)(struct ib_counters *counters, struct uverbs_attr_bundle *attrs); int (*destroy_counters)(struct ib_counters *counters); int (*read_counters)(struct ib_counters *counters, struct ib_counters_read_attr *counters_read_attr, struct uverbs_attr_bundle *attrs); int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); /** * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the * return struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); /** * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); /** * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); /** * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); int (*fill_res_srq_entry)(struct sk_buff *msg, struct ib_srq *ib_srq); int (*fill_res_srq_entry_raw)(struct sk_buff *msg, struct ib_srq *ib_srq); /* Device lifecycle callbacks */ /* * Called after the device becomes registered, before clients are * attached */ int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ void (*dealloc_driver)(struct ib_device *dev); /* iWarp CM callbacks */ void (*iw_add_ref)(struct ib_qp *qp); void (*iw_rem_ref)(struct ib_qp *qp); struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn); int (*iw_connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); /** * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, u32 port); /** * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); /** * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); /** * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); /** * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); /** * counter_init - Initialize the driver specific rdma counter struct. */ void (*counter_init)(struct rdma_counter *counter); /** * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); /* query driver for its ucontext properties */ int (*query_ucontext)(struct ib_ucontext *context, struct uverbs_attr_bundle *attrs); /* * Provide NUMA node. This API exists for rdmavt/hfi1 only. * Everyone else relies on Linux memory management model. */ int (*get_numa_node)(struct ib_device *dev); /** * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /** * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); /** * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); /** * report_port_event - Drivers need to implement this if they have * some private stuff to handle when link status changes. */ void (*report_port_event)(struct ib_device *ibdev, struct net_device *ndev, unsigned long event); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); DECLARE_RDMA_OBJ_SIZE(rdma_counter); }; struct ib_core_device { /* device must be the first element in structure until, * union of ib_core_device and device exists in ib_device. */ struct device dev; possible_net_t rdma_net; struct kobject *ports_kobj; struct list_head port_list; struct ib_device *owner; /* reach back to owner ib_device */ }; struct rdma_restrack_root; struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; struct rcu_head rcu_head; struct list_head event_handler_list; /* Protects event_handler_list */ struct rw_semaphore event_handler_rwsem; /* Protects QP's event_handler calls and open_qp list */ spinlock_t qp_open_list_lock; struct rw_semaphore client_data_rwsem; struct xarray client_data; struct mutex unregistration_lock; /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */ rwlock_t cache_lock; /** * port_data is indexed by port number */ struct ib_port_data *port_data; int num_comp_vectors; union { struct device dev; struct ib_core_device coredev; }; /* First group is for device attributes, * Second group is for driver provided attributes (optional). * Third group is for the hw_stats * It is a NULL terminated array. */ const struct attribute_group *groups[4]; u8 hw_stats_attr_index; u64 uverbs_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; /* Indicates kernel verbs support, should not be used in drivers */ u16 kverbs_provider:1; /* CQ adaptive moderation (RDMA DIM) */ u16 use_cq_dim:1; u8 node_type; u32 phys_port_cnt; struct ib_device_attr attrs; struct hw_stats_device_data *hw_stats_data; #ifdef CONFIG_CGROUP_RDMA struct rdmacg_device cg_device; #endif u32 index; spinlock_t cq_pools_lock; struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; struct rdma_restrack_root *res; const struct uapi_definition *driver_def; /* * Positive refcount indicates that the device is currently * registered and cannot be unregistered. */ refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; const struct rdma_link_ops *link_ops; /* Protects compat_devs xarray modifications */ struct mutex compat_devs_mutex; /* Maintains compat devices for each net namespace */ struct xarray compat_devs; /* Used by iWarp CM */ char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; u32 lag_flags; /* A parent device has a list of sub-devices */ struct mutex subdev_lock; struct list_head subdev_list_head; /* A sub device has a type and a parent */ enum rdma_nl_dev_type type; struct ib_device *parent; struct list_head subdev_list; enum rdma_nl_name_assign_type name_assign_type; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, gfp_t gfp, bool is_numa_aware) { if (is_numa_aware && dev->ops.get_numa_node) return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev)); return kzalloc(size, gfp); } struct ib_client_nl_info; struct ib_client { const char *name; int (*add)(struct ib_device *ibdev); void (*remove)(struct ib_device *, void *client_data); void (*rename)(struct ib_device *dev, void *client_data); int (*get_nl_info)(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res); int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. */ struct net_device *(*get_net_dev_by_params)( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); refcount_t uses; struct completion uses_zero; u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; }; /* * IB block DMA iterator * * Iterates the DMA-mapped SGL in contiguous memory blocks aligned * to a HW supported page size. */ struct ib_block_iter { /* internal states */ struct scatterlist *__sg; /* sg holding the current aligned block */ dma_addr_t __dma_addr; /* unaligned DMA address of this block */ size_t __sg_numblocks; /* ib_umem_num_dma_blocks() */ unsigned int __sg_nents; /* number of SG entries */ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ unsigned int __pg_bit; /* alignment of current block */ }; struct ib_device *_ib_alloc_device(size_t size); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member))), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device); void ib_unregister_device(struct ib_device *device); void ib_unregister_driver(enum rdma_driver_id driver_id); void ib_unregister_device_and_put(struct ib_device *device); void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz); bool __rdma_block_iter_next(struct ib_block_iter *biter); /** * rdma_block_iter_dma_address - get the aligned dma address of the current * block held by the block iterator. * @biter: block iterator holding the memory block */ static inline dma_addr_t rdma_block_iter_dma_address(struct ib_block_iter *biter) { return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); } /** * rdma_for_each_block - iterate over contiguous memory blocks of the sg list * @sglist: sglist to iterate over * @biter: block iterator holding the memory block * @nents: maximum number of sg entries to iterate over * @pgsz: best HW supported page size to use * * Callers may use rdma_block_iter_dma_address() to get each * blocks aligned DMA address. */ #define rdma_for_each_block(sglist, biter, nents, pgsz) \ for (__rdma_block_iter_start(biter, sglist, nents, \ pgsz); \ __rdma_block_iter_next(biter);) /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns the client context data set with * ib_set_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ static inline void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { return xa_load(&device->client_data, client->client_id); } void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, const struct ib_device_ops *ops); int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, unsigned long pfn, unsigned long size, pgprot_t prot, struct rdma_user_mmap_entry *entry); int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length); int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 min_pgoff, u32 max_pgoff); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) void rdma_user_mmap_disassociate(struct ib_device *device); #else static inline void rdma_user_mmap_disassociate(struct ib_device *device) { } #endif static inline int rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 pgoff) { return rdma_user_mmap_entry_insert_range(ucontext, entry, length, pgoff, pgoff); } struct rdma_user_mmap_entry * rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext, unsigned long pgoff); struct rdma_user_mmap_entry * rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, struct vm_area_struct *vma); void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry); void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_buffer_cleared(const void __user *p, size_t len) { bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { return ib_is_buffer_cleared(udata->inbuf + offset, len); } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(const struct ib_event *event); int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u32 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device * @device - The struct ib_device * to iterate over * @iter - The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ BUILD_BUG_ON_ZERO(!__same_type(u32, \ iter))); \ iter <= rdma_end_port(device); iter++) /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u32 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_is_grh_required(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u32 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Intel OmniPath devices extend and/or replace the InfiniBand Management * datagrams with their own versions. These OPA MADs share many but not all of * the characteristics of InfiniBand MADs. * * OPA MADs differ in the following ways: * * 1) MADs are variable size up to 2K * IBTA defined MADs remain fixed at 256 bytes * 2) OPA SMPs must carry valid PKeys * 3) OPA SMP packets are a different format * * Return: true if the port supports OPA MAD packet formats. */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_MAD; } /** * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). * @device: Device to check * @port_num: Port number to check * * Each InfiniBand node is required to provide a Subnet Management Agent * that the subnet manager can access. Prior to the fabric being fully * configured by the subnet manager, the SMA is accessed via a well known * interface called the Subnet Management Interface (SMI). This interface * uses directed route packets to communicate with the SM to get around the * chicken and egg problem of the SM needing to know what's on the fabric * in order to configure the fabric, and needing to configure the fabric in * order to send packets to the devices on the fabric. These directed * route packets do not need the fabric fully configured in order to reach * their destination. The SMI is the only method allowed to send * directed route packets on an InfiniBand fabric. * * Return: true if the port provides an SMI. */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SMI; } /** * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband * Communication Manager. * @device: Device to check * @port_num: Port number to check * * The InfiniBand Communication Manager is one of many pre-defined General * Service Agents (GSA) that are accessed via the General Service * Interface (GSI). It's role is to facilitate establishment of connections * between nodes as well as other management related tasks for established * connections. * * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_CM; } /** * rdma_cap_iw_cm - Check if the port of device has the capability IWARP * Communication Manager. * @device: Device to check * @port_num: Port number to check * * Similar to above, but specific to iWARP connections which have a different * managment protocol than InfiniBand. * * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IW_CM; } /** * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband * Subnet Administration. * @device: Device to check * @port_num: Port number to check * * An InfiniBand Subnet Administration (SA) service is a pre-defined General * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand * fabrics, devices should resolve routes to other hosts by contacting the * SA to query the proper route. * * Return: true if the port should act as a client to the fabric Subnet * Administration interface. This does not imply that the SA service is * running locally. */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SA; } /** * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband * Multicast. * @device: Device to check * @port_num: Port number to check * * InfiniBand multicast registration is more complex than normal IPv4 or * IPv6 multicast registration. Each Host Channel Adapter must register * with the Subnet Manager when it wishes to join a multicast group. It * should do so only once regardless of how many queue pairs it subscribes * to this group. And it should leave the group only after all queue pairs * attached to the group have been detached. * * Return: true if the port must undertake the additional adminstrative * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u32 port_num) { return rdma_cap_ib_sa(device, port_num); } /** * rdma_cap_af_ib - Check if the port of device has the capability * Native Infiniband Address. * @device: Device to check * @port_num: Port number to check * * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default * GID. RoCE uses a different mechanism, but still generates a GID via * a prescribed mechanism and port specific data. * * Return: true if the port uses a GID address to identify devices on the * network. */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_AF_IB; } /** * rdma_cap_eth_ah - Check if the port of device has the capability * Ethernet Address Handle. * @device: Device to check * @port_num: Port number to check * * RoCE is InfiniBand over Ethernet, and it uses a well defined technique * to fabricate GIDs over Ethernet/IP specific addresses native to the * port. Normally, packet headers are generated by the sending host * adapter, but when sending connectionless datagrams, we must manually * inject the proper headers for the fabric we are communicating over. * * Return: true if we are running as a RoCE port and must force the * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_ETH_AH; } /** * rdma_cap_opa_ah - Check if the port of device supports * OPA Address handles * @device: Device to check * @port_num: Port number to check * * Return: true if we are running on an OPA device which supports * the extended OPA addressing. */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * * @device: Device * @port_num: Port number * * This MAD size includes the MAD headers and MAD payload. No other headers * are included. * * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.max_mad_size; } /** * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table * @device: Device to check * @port_num: Port number to check * * RoCE GID table mechanism manages the various GIDs for a device. * * NOTE: if allocating the port's GID table has failed, this call will still * return true, but any RoCE GID table API will fail. * * Return: true if the port uses RoCE GID table mechanism in order to manage * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, u32 port_num) { return rdma_protocol_roce(device, port_num) && device->ops.add_gid && device->ops.del_gid; } /* * Check if the device supports READ W/ INVALIDATE. */ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) { /* * iWarp drivers must support READ W/ INVALIDATE. No other protocol * has support for it yet. */ return rdma_protocol_iwarp(dev, port_num); } /** * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. * @device: Device * @port_num: 1 based Port number * * Return true if port is an Intel OPA port , false if not */ static inline bool rdma_core_cap_opa_port(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA; } /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device * @port_num: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return * -1 if enum value of mtu is not supported. */ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, int mtu) { if (rdma_core_cap_opa_port(device, port)) return opa_mtu_enum_to_int((enum opa_mtu)mtu); else return ib_mtu_enum_to_int((enum ib_mtu)mtu); } /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device * @port_num: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. */ static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port, struct ib_port_attr *attr) { if (rdma_core_cap_opa_port(device, port)) return attr->phys_mtu; else return ib_mtu_enum_to_int(attr->max_mtu); } int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* * Create a memory registration for all memory in the system and place * the rkey for it into pd->unsafe_global_rkey. This can be used by * ULPs to avoid the overhead of dynamic MRs. * * This flag is generally considered unsafe and must only be used in * extremly trusted environments. Every use of it will log a warning * in the kernel log. */ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, }; struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ #define ib_alloc_pd(device, flags) \ __ib_alloc_pd((device), (flags), KBUILD_MODNAME) int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata); /** * ib_dealloc_pd - Deallocate kernel PD * @pd: The protection domain * * NOTE: for user PD use ib_dealloc_pd_user with valid udata! */ static inline void ib_dealloc_pd(struct ib_pd *pd) { int ret = ib_dealloc_pd_user(pd, NULL); WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail"); } enum rdma_create_ah_flags { /* In a sleepable context */ RDMA_CREATE_AH_SLEEPABLE = BIT(0), }; /** * rdma_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags); /** * rdma_create_user_ah - Creates an address handle for the given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); /** * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header * work completion. * @hdr: the L3 header to parse * @net_type: type of header to parse * @sgid: place to store source gid * @dgid: place to store destination gid */ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid); /** * ib_get_rdma_header_version - Get the header version * @hdr: the L3 header to parse */ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); /** * ib_init_ah_attr_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. * When ib_init_ah_attr_from_wc() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute * when GRH is present for IB link layer. * (b) for RoCE link layer it contains a reference to SGID attribute. * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_wc(). * */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num); /** * rdma_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** * rdma_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); enum rdma_destroy_ah_flags { /* In a sleepable context */ RDMA_DESTROY_AH_SLEEPABLE = BIT(0), }; /** * rdma_destroy_ah_user - Destroys an address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * @udata: Valid user data or NULL for kernel objects */ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata); /** * rdma_destroy_ah - Destroys an kernel address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * * NOTE: for user ah use rdma_destroy_ah_user with valid udata! */ static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags) { int ret = rdma_destroy_ah_user(ah, flags, NULL); WARN_ONCE(ret, "Destroy of kernel AH shouldn't fail"); } struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata); static inline struct ib_srq * ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { if (!pd->device->ops.create_srq) return ERR_PTR(-EOPNOTSUPP); return ib_create_srq_user(pd, srq_init_attr, NULL, NULL); } /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq_user - Destroys the specified SRQ. * @srq: The SRQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata); /** * ib_destroy_srq - Destroys the specified kernel SRQ. * @srq: The SRQ to destroy. * * NOTE: for user srq use ib_destroy_srq_user with valid udata! */ static inline void ib_destroy_srq(struct ib_srq *srq) { int ret = ib_destroy_srq_user(srq, NULL); WARN_ONCE(ret, "Destroy of kernel SRQ shouldn't fail"); } /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return srq->device->ops.post_srq_recv(srq, recv_wr, bad_recv_wr ? : &dummy); } struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller); /** * ib_create_qp - Creates a kernel QP associated with the specific protection * domain. * @pd: The protection domain associated with the QP. * @init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ static inline struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { return ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME); } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. * @udata: Valid udata or NULL for kernel objects */ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata); /** * ib_destroy_qp - Destroys the specified kernel QP. * @qp: The QP to destroy. * * NOTE: for user qp use ib_destroy_qp_user with valid udata! */ static inline int ib_destroy_qp(struct ib_qp *qp) { return ib_destroy_qp_user(qp, NULL); } /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd - XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr) { const struct ib_send_wr *dummy; return qp->device->ops.post_send(qp, send_wr, bad_send_wr ? : &dummy); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy); } struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller); static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) { return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, KBUILD_MODNAME); } struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller); /** * ib_alloc_cq_any: Allocate kernel CQ * @dev: The IB device * @private: Private data attached to the CQE * @nr_cqe: Number of CQEs in the CQ * @poll_ctx: Context used for polling the CQ */ static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx) { return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx, KBUILD_MODNAME); } void ib_free_cq(struct ib_cq *cq); int ib_process_cq_direct(struct ib_cq *cq, int budget); /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller); #define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \ __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * rdma_set_cq_moderation - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq_user - Destroys the specified CQ. * @cq: The CQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata); /** * ib_destroy_cq - Destroys the specified kernel CQ. * @cq: The CQ to destroy. * * NOTE: for user cq use ib_destroy_cq_user with valid udata! */ static inline void ib_destroy_cq(struct ib_cq *cq) { int ret = ib_destroy_cq_user(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); } /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->ops.poll_cq(cq, num_entries, wc); } /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->ops.req_notify_cq(cq, flags); } struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx); void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); /* * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to * NULL. This causes the ib_dma* helpers to just stash the kernel virtual * address into the dma address. */ static inline bool ib_uses_virt_dma(struct ib_device *dev) { return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device; } /* * Check if a IB device's underlying DMA mapping supports P2PDMA transfers. */ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return false; return dma_pci_p2pdma_supported(dev->dma_device); } /** * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after * going through the dma_addr marshalling. */ static inline void *ib_virt_dma_to_ptr(u64 dma_addr) { /* virt_dma mode maps the kvs's directly into the dma addr */ return (void *)(uintptr_t)dma_addr; } /** * ib_virt_dma_to_page - Convert a dma_addr to a struct page * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() device to get back to the struct page after going * through the dma_addr marshalling. */ static inline struct page *ib_virt_dma_to_page(u64 dma_addr) { return virt_to_page(ib_virt_dma_to_ptr(dma_addr)); } /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (ib_uses_virt_dma(dev)) return 0; return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)cpu_addr; return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_single(dev->dma_device, addr, size, direction); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)(page_address(page) + offset); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_page(dev->dma_device, addr, size, direction); } int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (ib_uses_virt_dma(dev)) return ib_dma_virt_map_sg(dev, sg, nents); return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The sg_table object describing the buffer * @direction: The direction of the DMA * @attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { int nents; if (ib_uses_virt_dma(dev)) { nents = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents); if (!nents) return -EIO; sgt->nents = nents; return 0; } return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs); } static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer * @dev: The device to query * * The returned value represents a size in bytes. */ static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return UINT_MAX; return dma_get_max_seg_size(dev->dma_device); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /* ib_reg_user_mr - register a memory region for virtual addresses from kernel * space. This function should be called when 'current' is the owning MM. */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags); /* ib_advise_mr - give an advice about an address range in a memory region */ int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * @udata: Valid user data or NULL for kernel object * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata); /** * ib_dereg_mr - Deregisters a kernel memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. * * NOTE: for user mr use ib_dereg_mr_user with valid udata! */ static inline int ib_dereg_mr(struct ib_mr *mr) { return ib_dereg_mr_user(mr, NULL); } struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr - struct ib_mr pointer to be updated. * @newkey - new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey - the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata); int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { u64 device_cap = ib_dev->attrs.device_cap_flags; /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; if (flags & ~IB_ACCESS_SUPPORTED) return -EINVAL; if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; if ((flags & IB_ACCESS_FLUSH_GLOBAL && !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || (flags & IB_ACCESS_FLUSH_PERSISTENT && !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) return -EOPNOTSUPP; return 0; } static inline bool ib_access_writable(int access_flags) { /* * We have writable memory backing the MR if any of the following * access flags are set. "Local write" and "remote write" obviously * require write access. "Remote atomic" can do things like fetch and * add, which will modify memory, and "MW bind" can change permissions * by binding a window. */ return access_flags & (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); /** * ib_device_try_get: Hold a registration lock * device: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully * registered, otherwise this function returns false. * * The registration lock is only necessary for actions which require the * device to still be registered. Uses that only require the device pointer to * be valid should use get_device(&ibdev->dev) to hold the memory. * */ static inline bool ib_device_try_get(struct ib_device *dev) { return refcount_inc_not_zero(&dev->refcount); } void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port); static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) { return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; } void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) return attr->roce.dmac; return NULL; } static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.dlid = (u16)dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.dlid = dlid; } static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.dlid; return 0; } static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) { attr->sl = sl; } static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) { return attr->sl; } static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, u8 src_path_bits) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.src_path_bits = src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.src_path_bits = src_path_bits; } static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.src_path_bits; return 0; } static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, bool make_grd) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.make_grd = make_grd; } static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.make_grd; return false; } static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num) { attr->port_num = port_num; } static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) { return attr->port_num; } static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, u8 static_rate) { attr->static_rate = static_rate; } static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) { return attr->static_rate; } static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, enum ib_ah_flags flag) { attr->ah_flags = flag; } static inline enum ib_ah_flags rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) { return attr->ah_flags; } static inline const struct ib_global_route *rdma_ah_read_grh(const struct rdma_ah_attr *attr) { return &attr->grh; } /*To retrieve and modify the grh */ static inline struct ib_global_route *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) { return &attr->grh; } static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); } static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, __be64 prefix) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.subnet_prefix = prefix; } static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, __be64 if_id) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.interface_id = if_id; } static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 sgid_index, u8 hop_limit, u8 traffic_class) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); attr->ah_flags = IB_AH_GRH; if (dgid) grh->dgid = *dgid; grh->flow_label = flow_label; grh->sgid_index = sgid_index; grh->hop_limit = hop_limit; grh->traffic_class = traffic_class; grh->sgid_attr = NULL; } void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr); void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr); void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src); void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new); void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src); /** * rdma_ah_find_type - Return address handle type. * * @dev: Device to be checked * @port_num: Port number */ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, u32 port_num) { if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; if (rdma_protocol_ib(dev, port_num)) { if (rdma_cap_opa_ah(dev, port_num)) return RDMA_AH_ATTR_TYPE_OPA; return RDMA_AH_ATTR_TYPE_IB; } if (dev->type == RDMA_DEVICE_TYPE_SMI) return RDMA_AH_ATTR_TYPE_IB; return RDMA_AH_ATTR_TYPE_UNDEFINED; } /** * ib_lid_cpu16 - Return lid in 16bit CPU encoding. * In the current implementation the only way to * get the 32bit lid is from other sources for OPA. * For IB, lids will always be 16bits so cast the * value accordingly. * * @lid: A 32bit LID */ static inline u16 ib_lid_cpu16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return (u16)lid; } /** * ib_lid_be16 - Return lid in 16bit BE encoding. * * @lid: A 32bit LID */ static inline __be16 ib_lid_be16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return cpu_to_be16((u16)lid); } /** * ib_get_vector_affinity - Get the affinity mappings of a given completion * vector * @device: the rdma device * @comp_vector: index of completion vector * * Returns NULL on failure, otherwise a corresponding cpu map of the * completion vector (returns all-cpus map if the device driver doesn't * implement get_vector_affinity). */ static inline const struct cpumask * ib_get_vector_affinity(struct ib_device *device, int comp_vector) { if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || !device->ops.get_vector_affinity) return NULL; return device->ops.get_vector_affinity(device, comp_vector); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @device: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); void roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); #else static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) { return 0; } #endif struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev); /** * rdma_device_to_ibdev - Get ib_device pointer from device pointer * * @device: device pointer for which ib_device pointer to retrieve * * rdma_device_to_ibdev() retrieves ib_device pointer from device. * */ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) { struct ib_core_device *coredev = container_of(device, struct ib_core_device, dev); return coredev->owner; } /** * ibdev_to_node - return the NUMA node for a given ib_device * @dev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { struct device *parent = ibdev->dev.parent; if (!parent) return NUMA_NO_NODE; return dev_to_node(parent); } /** * rdma_device_to_drv_device - Helper macro to reach back to driver's * ib_device holder structure from device pointer. * * NOTE: New drivers should not make use of this API; This API is only for * existing drivers who have exposed sysfs entries using * ops->device_group. */ #define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same * convention. */ static inline u16 rdma_flow_label_to_udp_sport(u32 fl) { u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000; fl_low ^= fl_high >> 14; return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); } /** * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on * local and remote qpn values * * This function folded the multiplication results of two qpns, 24 bit each, * fields, and converts it to a 20 bit results. * * This function will create symmetric flow_label value based on the local * and remote qpn values. this will allow both the requester and responder * to calculate the same flow_label for a given connection. * * This helper function should be used by driver in case the upper layer * provide a zero flow_label value. This is to improve entropy of RDMA * traffic in the network. */ static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn) { u64 v = (u64)lqpn * rqpn; v ^= v >> 20; v ^= v >> 40; return (u32)(v & IB_GRH_FLOWLABEL_MASK); } /** * rdma_get_udp_sport - Calculate and set UDP source port based on the flow * label. If flow label is not defined in GRH then * calculate it based on lqpn/rqpn. * * @fl: flow label from GRH * @lqpn: local qp number * @rqpn: remote qp number */ static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn) { if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); return rdma_flow_label_to_udp_sport(fl); } const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port); /** ib_add_sub_device - Add a sub IB device on an existing one * * @parent: The IB device that needs to add a sub device * @type: The type of the new sub device * @name: The name of the new sub device * * * Return 0 on success, an error code otherwise */ int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); /** ib_del_sub_device_and_put - Delect an IB sub device while holding a 'get' * * @sub: The sub device that is going to be deleted * * Return 0 on success, an error code otherwise */ int ib_del_sub_device_and_put(struct ib_device *sub); static inline void ib_mark_name_assigned_by_user(struct ib_device *ibdev) { ibdev->name_assign_type = RDMA_NAME_ASSIGN_TYPE_USER; } #endif /* IB_VERBS_H */
36 36 36 36 36 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _LINUX_RCUREF_H #define _LINUX_RCUREF_H #include <linux/atomic.h> #include <linux/bug.h> #include <linux/limits.h> #include <linux/lockdep.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #define RCUREF_ONEREF 0x00000000U #define RCUREF_MAXREF 0x7FFFFFFFU #define RCUREF_SATURATED 0xA0000000U #define RCUREF_RELEASED 0xC0000000U #define RCUREF_DEAD 0xE0000000U #define RCUREF_NOREF 0xFFFFFFFFU /** * rcuref_init - Initialize a rcuref reference count with the given reference count * @ref: Pointer to the reference count * @cnt: The initial reference count typically '1' */ static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) { atomic_set(&ref->refcnt, cnt - 1); } /** * rcuref_read - Read the number of held reference counts of a rcuref * @ref: Pointer to the reference count * * Return: The number of held references (0 ... N). The value 0 does not * indicate that it is safe to schedule the object, protected by this reference * counter, for deconstruction. * If you want to know if the reference counter has been marked DEAD (as * signaled by rcuref_put()) please use rcuread_is_dead(). */ static inline unsigned int rcuref_read(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); /* Return 0 if within the DEAD zone. */ return c >= RCUREF_RELEASED ? 0 : c + 1; } /** * rcuref_is_dead - Check if the rcuref has been already marked dead * @ref: Pointer to the reference count * * Return: True if the object has been marked DEAD. This signals that a previous * invocation of rcuref_put() returned true on this reference counter meaning * the protected object can safely be scheduled for deconstruction. * Otherwise, returns false. */ static inline bool rcuref_is_dead(rcuref_t *ref) { unsigned int c = atomic_read(&ref->refcnt); return (c >= RCUREF_RELEASED) && (c < RCUREF_NOREF); } extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); /** * rcuref_get - Acquire one reference on a rcuref reference count * @ref: Pointer to the reference count * * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See documentation in lib/rcuref.c * * Return: * False if the attempt to acquire a reference failed. This happens * when the last reference has been put already * * True if a reference was successfully acquired */ static inline __must_check bool rcuref_get(rcuref_t *ref) { /* * Unconditionally increase the reference count. The saturation and * dead zones provide enough tolerance for this. */ if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) return true; /* Handle the cases inside the saturation and dead zones */ return rcuref_get_slowpath(ref); } extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { int cnt; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ cnt = atomic_sub_return_release(1, &ref->refcnt); if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ return rcuref_put_slowpath(ref, cnt); } /** * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe * @ref: Pointer to the reference count * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Can be invoked from contexts, which guarantee that no grace period can * happen which would free the object concurrently if the decrement drops * the last reference and the slowpath races against a concurrent get() and * put() pair. rcu_read_lock()'ed and atomic contexts qualify. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely release the * object which is protected by the reference counter. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * release the protected object. */ static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) { return __rcuref_put(ref); } /** * rcuref_put -- Release one reference for a rcuref reference count * @ref: Pointer to the reference count * * Can be invoked from any context. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides an acquire ordering on success such that free() * must come after. * * Return: * * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ static inline __must_check bool rcuref_put(rcuref_t *ref) { bool released; preempt_disable(); released = __rcuref_put(ref); preempt_enable(); return released; } #endif
82 82 206 206 206 200 24 206 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/capability.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/tty.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/cpu.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/freezer.h> #include <linux/binfmts.h> #include <linux/nsproxy.h> #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/kthread.h> #include <linux/mempolicy.h> #include <linux/taskstats_kern.h> #include <linux/delayacct.h> #include <linux/cgroup.h> #include <linux/syscalls.h> #include <linux/signal.h> #include <linux/posix-timers.h> #include <linux/cn_proc.h> #include <linux/mutex.h> #include <linux/futex.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> #include <linux/blkdev.h> #include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> #include <trace/events/sched.h> #include <linux/hw_breakpoint.h> #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> #include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> #include <linux/sysfs.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <linux/pidfs.h> #include <uapi/linux/wait.h> #include <asm/unistd.h> #include <asm/mmu_context.h> #include "exit.h" /* * The default value should be high enough to not crash a system that randomly * crashes its kernel from time to time, but low enough to at least not permit * overflowing 32-bit refcounts or the ldsem writer count. */ static unsigned int oops_limit = 10000; #ifdef CONFIG_SYSCTL static const struct ctl_table kern_exit_table[] = { { .procname = "oops_limit", .data = &oops_limit, .maxlen = sizeof(oops_limit), .mode = 0644, .proc_handler = proc_douintvec, }, }; static __init int kernel_exit_sysctls_init(void) { register_sysctl_init("kernel", kern_exit_table); return 0; } late_initcall(kernel_exit_sysctls_init); #endif static atomic_t oops_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); } static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); static __init int kernel_exit_sysfs_init(void) { sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); return 0; } late_initcall(kernel_exit_sysfs_init); #endif /* * For things release_task() would like to do *after* tasklist_lock is released. */ struct release_task_post { struct pid *pids[PIDTYPE_MAX]; }; static void __unhash_process(struct release_task_post *post, struct task_struct *p, bool group_dead) { struct pid *pid = task_pid(p); nr_threads--; detach_pid(post->pids, p, PIDTYPE_PID); wake_up_all(&pid->wait_pidfd); if (group_dead) { detach_pid(post->pids, p, PIDTYPE_TGID); detach_pid(post->pids, p, PIDTYPE_PGID); detach_pid(post->pids, p, PIDTYPE_SID); list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); } list_del_rcu(&p->thread_node); } /* * This function expects the tasklist_lock write-locked. */ static void __exit_signal(struct release_task_post *post, struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; struct tty_struct *tty; u64 utime, stime; sighand = rcu_dereference_check(tsk->sighand, lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); #ifdef CONFIG_POSIX_TIMERS posix_cpu_timers_exit(tsk); if (group_dead) posix_cpu_timers_exit_group(tsk); #endif if (group_dead) { tty = sig->tty; sig->tty = NULL; } else { /* * If there is any task waiting for the group exit * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); } /* * Accumulate here the counters for all threads as they die. We could * skip the group leader because it is the last user of signal_struct, * but we want to avoid the race with thread_group_cputime() which can * see the empty ->thread_head list. */ task_cputime(tsk, &utime, &stime); write_seqlock(&sig->stats_lock); sig->utime += utime; sig->stime += stime; sig->gtime += task_gtime(tsk); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(post, tsk, group_dead); write_sequnlock(&sig->stats_lock); tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); if (group_dead) tty_kref_put(tty); } static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); rethook_flush_task(tsk); perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } void put_task_struct_rcu_user(struct task_struct *task) { if (refcount_dec_and_test(&task->rcu_users)) call_rcu(&task->rcu, delayed_put_task_struct); } void __weak release_thread(struct task_struct *dead_task) { } void release_task(struct task_struct *p) { struct release_task_post post; struct task_struct *leader; struct pid *thread_pid; int zap_leader; repeat: memset(&post, 0, sizeof(post)); /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); pidfs_exit(p); cgroup_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. */ thread_pid = task_pid(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); __exit_signal(&post, p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the * group leader's parent process. (if it wants notification.) */ zap_leader = 0; leader = p->group_leader; if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { /* for pidfs_exit() and do_notify_parent() */ if (leader->signal->flags & SIGNAL_GROUP_EXIT) leader->exit_code = leader->signal->group_exit_code; /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, * then we are the one who should release the leader. */ zap_leader = do_notify_parent(leader, leader->exit_signal); if (zap_leader) leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); release_thread(p); /* * This task was already removed from the process/thread/pid lists * and lock_task_sighand(p) can't succeed. Nobody else can touch * ->pending or, if group dead, signal->shared_pending. We can call * flush_sigqueue() lockless. */ flush_sigqueue(&p->pending); if (thread_group_leader(p)) flush_sigqueue(&p->signal->shared_pending); put_task_struct_rcu_user(p); p = leader; if (unlikely(zap_leader)) goto repeat; } int rcuwait_wake_up(struct rcuwait *w) { int ret = 0; struct task_struct *task; rcu_read_lock(); /* * Order condition vs @task, such that everything prior to the load * of @task is visible. This is the condition as to why the user called * rcuwait_wake() in the first place. Pairs with set_current_state() * barrier (A) in rcuwait_wait_event(). * * WAIT WAKE * [S] tsk = current [S] cond = true * MB (A) MB (B) * [L] cond [L] tsk */ smp_mb(); /* (B) */ task = rcu_dereference(w->task); if (task) ret = wake_up_process(task); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(rcuwait_wake_up); /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are * to receive a SIGHUP and a SIGCONT. * * "I ask you, have you ever known what it is to be an orphan?" */ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if ((p == ignored_task) || (p->exit_state && thread_group_empty(p)) || is_global_init(p->real_parent)) continue; if (task_pgrp(p->real_parent) != pgrp && task_session(p->real_parent) == task_session(p)) return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return 1; } int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; } static bool has_stopped_jobs(struct pid *pgrp) { struct task_struct *p; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->signal->flags & SIGNAL_STOP_STOPPED) return true; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return false; } /* * Check to see if any process groups have become orphaned as * a result of our exiting, and if they have any stopped jobs, * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) { struct pid *pgrp = task_pgrp(tsk); struct task_struct *ignored_task = tsk; if (!parent) /* exit: our father is in a different pgrp than * we are and we were the only connection outside. */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than * we are, and it was the only connection outside. */ ignored_task = NULL; if (task_pgrp(parent) != pgrp && task_session(parent) == task_session(tsk) && will_become_orphaned_pgrp(pgrp, ignored_task) && has_stopped_jobs(pgrp)) { __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } static void coredump_task_exit(struct task_struct *tsk, struct core_state *core_state) { struct core_thread self; self.task = tsk; if (self.task->flags & PF_SIGNALED) self.next = xchg(&core_state->dumper.next, &self); else self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. */ if (atomic_dec_and_test(&core_state->nr_threads)) complete(&core_state->startup); for (;;) { set_current_state(TASK_IDLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; schedule(); } __set_current_state(TASK_RUNNING); } #ifdef CONFIG_MEMCG /* drops tasklist_lock if succeeds */ static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm) { bool ret = false; task_lock(tsk); if (likely(tsk->mm == mm)) { /* tsk can't pass exit_mm/exec_mmap and exit */ read_unlock(&tasklist_lock); WRITE_ONCE(mm->owner, tsk); lru_gen_migrate_mm(mm); ret = true; } task_unlock(tsk); return ret; } static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm) { struct task_struct *t; for_each_thread(g, t) { struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm == mm) { if (__try_to_set_owner(t, mm)) return true; } else if (t_mm) break; } return false; } /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ void mm_update_next_owner(struct mm_struct *mm) { struct task_struct *g, *p = current; /* * If the exiting or execing task is not the owner, it's * someone else's problem. */ if (mm->owner != p) return; /* * The current owner is exiting/execing and there are no other * candidates. Do not leave the mm pointing to a possibly * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { WRITE_ONCE(mm->owner, NULL); return; } read_lock(&tasklist_lock); /* * Search in the children */ list_for_each_entry(g, &p->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search in the siblings */ list_for_each_entry(g, &p->real_parent->children, sibling) { if (try_to_set_owner(g, mm)) goto ret; } /* * Search through everything else, we should not get here often. */ for_each_process(g) { if (atomic_read(&mm->mm_users) <= 1) break; if (g->flags & PF_KTHREAD) continue; if (try_to_set_owner(g, mm)) goto ret; } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ WRITE_ONCE(mm->owner, NULL); ret: return; } #endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ static void exit_mm(void) { struct mm_struct *mm = current->mm; exit_mm_release(current, mm); if (!mm) return; mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); /* * When a thread stops operating on an address space, the loop * in membarrier_private_expedited() may not observe that * tsk->mm, and the loop in membarrier_global_expedited() may * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED * rq->membarrier_state, so those would not issue an IPI. * Membarrier requires a memory barrier after accessing * user-space memory, before clearing tsk->mm or the * rq->membarrier_state. */ smp_mb__after_spinlock(); local_irq_disable(); current->mm = NULL; membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); local_irq_enable(); task_unlock(current); mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) { struct task_struct *t; for_each_thread(p, t) { if (!(t->flags & PF_EXITING)) return t; } return NULL; } static struct task_struct *find_child_reaper(struct task_struct *father, struct list_head *dead) __releases(&tasklist_lock) __acquires(&tasklist_lock) { struct pid_namespace *pid_ns = task_active_pid_ns(father); struct task_struct *reaper = pid_ns->child_reaper; struct task_struct *p, *n; if (likely(reaper != father)) return reaper; reaper = find_alive_thread(father); if (reaper) { pid_ns->child_reaper = reaper; return reaper; } write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); return father; } /* * When we die, we re-parent all our children, and try to: * 1. give them to another thread in our thread group, if such a member exists * 2. give it to the first ancestor process which prctl'd itself as a * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father, struct task_struct *child_reaper) { struct task_struct *thread, *reaper; thread = find_alive_thread(father); if (thread) return thread; if (father->signal->has_child_subreaper) { unsigned int ns_level = task_pid(father)->level; /* * Find the first ->is_child_subreaper ancestor in our pid_ns. * We can't check reaper != child_reaper to ensure we do not * cross the namespaces, the exiting parent could be injected * by setns() + fork(). * We check pid->level, this is slightly more efficient than * task_active_pid_ns(reaper) != task_active_pid_ns(father). */ for (reaper = father->real_parent; task_pid(reaper)->level == ns_level; reaper = reaper->real_parent) { if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; thread = find_alive_thread(reaper); if (thread) return thread; } } return child_reaper; } /* * Any that need to be release_task'd are put on the @dead list. */ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { if (unlikely(p->exit_state == EXIT_DEAD)) return; /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } /* * This does two things: * * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) { struct task_struct *p, *t, *reaper; if (unlikely(!list_empty(&father->ptraced))) exit_ptrace(father, dead); /* Can drop and reacquire tasklist_lock */ reaper = find_child_reaper(father, dead); if (list_empty(&father->children)) return; reaper = find_new_reaper(father, reaper); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { RCU_INIT_POINTER(t->real_parent, reaper); BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father)); if (likely(!t->ptrace)) t->parent = t->real_parent; if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t, PIDTYPE_TGID); } /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. */ if (!same_thread_group(reaper, father)) reparent_leader(father, p, dead); } list_splice_tail_init(&father->children, &reaper->children); } /* * Send signals to all our closest relatives so that they know * to properly mourn us.. */ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; struct task_struct *p, *n; LIST_HEAD(dead); write_lock_irq(&tasklist_lock); forget_original_parent(tsk, &dead); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); tsk->exit_state = EXIT_ZOMBIE; if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && !ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD; autoreap = do_notify_parent(tsk, sig); } else if (thread_group_leader(tsk)) { autoreap = thread_group_empty(tsk) && do_notify_parent(tsk, tsk->exit_signal); } else { autoreap = true; /* untraced sub-thread */ do_notify_pidfd(tsk); } if (autoreap) { tsk->exit_state = EXIT_DEAD; list_add(&tsk->ptrace_entry, &dead); } /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } } #ifdef CONFIG_DEBUG_STACK_USAGE unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ # ifdef CONFIG_STACK_GROWSUP n--; # else n++; # endif } while (!*n); # ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; # else return (unsigned long)n - (unsigned long)end_of_stack(p); # endif } /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) { #ifdef CONFIG_VM_EVENT_COUNTERS if (used_stack <= 1024) count_vm_event(KSTACK_1K); #if THREAD_SIZE > 1024 else if (used_stack <= 2048) count_vm_event(KSTACK_2K); #endif #if THREAD_SIZE > 2048 else if (used_stack <= 4096) count_vm_event(KSTACK_4K); #endif #if THREAD_SIZE > 4096 else if (used_stack <= 8192) count_vm_event(KSTACK_8K); #endif #if THREAD_SIZE > 8192 else if (used_stack <= 16384) count_vm_event(KSTACK_16K); #endif #if THREAD_SIZE > 16384 else if (used_stack <= 32768) count_vm_event(KSTACK_32K); #endif #if THREAD_SIZE > 32768 else if (used_stack <= 65536) count_vm_event(KSTACK_64K); #endif #if THREAD_SIZE > 65536 else count_vm_event(KSTACK_REST); #endif #endif /* CONFIG_VM_EVENT_COUNTERS */ } static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; unsigned long free; free = stack_not_used(current); kstack_histogram(THREAD_SIZE - free); if (free >= lowest_to_date) return; spin_lock(&low_water_lock); if (free < lowest_to_date) { pr_info("%s (%d) used greatest stack depth: %lu bytes left\n", current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); } #else static inline void check_stack_usage(void) {} #endif static void synchronize_group_exit(struct task_struct *tsk, long code) { struct sighand_struct *sighand = tsk->sighand; struct signal_struct *signal = tsk->signal; struct core_state *core_state; spin_lock_irq(&sighand->siglock); signal->quick_threads--; if ((signal->quick_threads == 0) && !(signal->flags & SIGNAL_GROUP_EXIT)) { signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = code; signal->group_stop_count = 0; } /* * Serialize with any possible pending coredump. * We must hold siglock around checking core_state * and setting PF_POSTCOREDUMP. The core-inducing thread * will increment ->nr_threads for each thread in the * group without PF_POSTCOREDUMP set. */ tsk->flags |= PF_POSTCOREDUMP; core_state = signal->core_state; spin_unlock_irq(&sighand->siglock); if (unlikely(core_state)) coredump_task_exit(tsk, core_state); } void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; WARN_ON(irqs_disabled()); WARN_ON(tsk->plug); kcov_task_exit(tsk); kmsan_task_exit(tsk); synchronize_group_exit(tsk, code); ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ seccomp_filter_release(tsk); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { /* * If the last thread of global init has exited, panic * immediately to get a useable coredump. */ if (unlikely(is_global_init(tsk))) panic("Attempted to kill init! exitcode=0x%08x\n", tsk->signal->group_exit_code ?: (int)code); #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) tty_audit_exit(); audit_free(tsk); tsk->exit_code = code; taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); /* * Since sampling can touch ->mm, make sure to stop everything before we * tear it down. * * Also flushes inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); exit_mm(); if (group_dead) acct_process(); exit_sem(tsk); exit_shm(tsk); exit_files(tsk); exit_fs(tsk); if (group_dead) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); sched_autogroup_exit_task(tsk); cgroup_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); #endif /* * Make sure we are holding no locks: */ debug_check_no_locks_held(); if (tsk->io_context) exit_io_context(tsk); if (tsk->splice_pipe) free_pipe_info(tsk->splice_pipe); if (tsk->task_frag.page) put_page(tsk->task_frag.page); exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); exit_tasks_rcu_finish(); lockdep_free_task(tsk); do_task_dead(); } void __noreturn make_task_dead(int signr) { /* * Take the task off the cpu after something catastrophic has * happened. * * We can get here from a kernel oops, sometimes with preemption off. * Start by checking for critical errors. * Then fix up important state like USER_DS and preemption. * Then do everything else. */ struct task_struct *tsk = current; unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); if (unlikely(irqs_disabled())) { pr_info("note: %s[%d] exited with irqs disabled\n", current->comm, task_pid_nr(current)); local_irq_enable(); } if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), preempt_count()); preempt_count_set(PREEMPT_ENABLED); } /* * Every time the system oopses, if the oops happens while a reference * to an object was held, the reference leaks. * If the oops doesn't also leak memory, repeated oopsing can cause * reference counters to wrap around (if they're not using refcount_t). * This means that repeated oopsing can make unexploitable-looking bugs * exploitable through repeated oopsing. * To make sure this can't happen, place an upper bound on how often the * kernel may oops without panic(). */ limit = READ_ONCE(oops_limit); if (atomic_inc_return(&oops_count) >= limit && limit) panic("Oopsed too often (kernel.oops_limit is %d)", limit); /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); tsk->exit_state = EXIT_DEAD; refcount_inc(&tsk->rcu_users); do_task_dead(); } do_exit(signr); } SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); } /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; zap_other_threads(current); } spin_unlock_irq(&sighand->siglock); } do_exit(exit_code); /* NOTREACHED */ } /* * this kills every thread in the thread group. Note that any externally * wait4()-ing process will get the correct exit code - even if this * thread is not the thread group leader. */ SYSCALL_DEFINE1(exit_group, int, error_code) { do_group_exit((error_code & 0xff) << 8); /* NOTREACHED */ return 0; } static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { return wo->wo_type == PIDTYPE_MAX || task_pid_type(p, wo->wo_type) == wo->wo_pid; } static int eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; /* * Wait for all children (clone and not) if __WALL is set or * if it is traced by us. */ if (ptrace || (wo->wo_flags & __WALL)) return 1; /* * Otherwise, wait for clone children *only* if __WCLONE is set; * otherwise, wait for non-clone children *only*. * * Note: a "clone" child here is one that reports to its parent * using a signal other than SIGCHLD, or a non-leader thread which * we can only see if it is traced by us. */ if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; } /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { int state, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct waitid_info *infop; if (!likely(wo->wo_flags & WEXITED)) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); goto out_info; } /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ state = (ptrace_reparented(p) && thread_group_leader(p)) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* * We own this thread, nobody else can reap it. */ read_unlock(&tasklist_lock); sched_annotate_sleep(); /* * Check thread_group_leader() to exclude the traced sub-threads. */ if (state == EXIT_DEAD && thread_group_leader(p)) { struct signal_struct *sig = p->signal; struct signal_struct *psig = current->signal; unsigned long maxrss; u64 tgutime, tgstime; /* * The resource counters for the group leader are in its * own task_struct. Those for dead threads in the group * are in its signal_struct, as are those for the child * processes it has previously reaped. All these * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these * p->signal fields because the whole thread group is dead * and nobody can change them. * * psig->stats_lock also protects us from our sub-threads * which can reap other children at the same time. * * We use thread_group_cputime_adjusted() to get times for * the thread group, which consolidates times for all threads * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); write_seqlock_irq(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += p->maj_flt + sig->maj_flt + sig->cmaj_flt; psig->cnvcsw += p->nvcsw + sig->nvcsw + sig->cnvcsw; psig->cnivcsw += p->nivcsw + sig->nivcsw + sig->cnivcsw; psig->cinblock += task_io_get_inblock(p) + sig->inblock + sig->cinblock; psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; maxrss = max(sig->maxrss, sig->cmaxrss); if (psig->cmaxrss < maxrss) psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); write_sequnlock_irq(&psig->stats_lock); } if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; wo->wo_stat = status; if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } if (state == EXIT_DEAD) release_task(p); out_info: infop = wo->wo_info; if (infop) { if ((status & 0x7f) == 0) { infop->cause = CLD_EXITED; infop->status = status >> 8; } else { infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED; infop->status = status & 0x7f; } infop->pid = pid; infop->uid = uid; } return pid; } static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) return &p->signal->group_exit_code; } return NULL; } /** * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED * @wo: wait options * @ptrace: is the wait for ptrace * @p: task to wait for * * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: * read_lock(&tasklist_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on * success, implies that tasklist_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) { struct waitid_info *infop; int exit_code, *p_code, why; uid_t uid = 0; /* unneeded, required by compiler */ pid_t pid; /* * Traditionally we see ptrace'd stopped tasks regardless of options. */ if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; if (!task_stopped_code(p, ptrace)) return 0; exit_code = 0; spin_lock_irq(&p->sighand->siglock); p_code = task_stopped_code(p, ptrace); if (unlikely(!p_code)) goto unlock_sig; exit_code = *p_code; if (!exit_code) goto unlock_sig; if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; uid = from_kuid_munged(current_user_ns(), task_uid(p)); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) return 0; /* * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to * keep holding onto the tasklist_lock while we call getrusage and * possibly take page faults for user memory. */ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); if (likely(!(wo->wo_flags & WNOWAIT))) wo->wo_stat = (exit_code << 8) | 0x7f; infop = wo->wo_info; if (infop) { infop->cause = why; infop->status = exit_code; infop->pid = pid; infop->uid = uid; } return pid; } /* * Handle do_wait work for one task in a live, non-stopped state. * read_lock(&tasklist_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) { struct waitid_info *infop; pid_t pid; uid_t uid; if (!unlikely(wo->wo_flags & WCONTINUED)) return 0; if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) return 0; spin_lock_irq(&p->sighand->siglock); /* Re-check with the lock held. */ if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { spin_unlock_irq(&p->sighand->siglock); return 0; } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; uid = from_kuid_munged(current_user_ns(), task_uid(p)); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); if (wo->wo_rusage) getrusage(p, RUSAGE_BOTH, wo->wo_rusage); put_task_struct(p); infop = wo->wo_info; if (!infop) { wo->wo_stat = 0xffff; } else { infop->cause = CLD_CONTINUED; infop->pid = pid; infop->uid = uid; infop->status = SIGCONT; } return pid; } /* * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or still -ECHILD. */ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { /* * We can race with wait_task_zombie() from another thread. * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition * can't confuse the checks below. */ int exit_state = READ_ONCE(p->exit_state); int ret; if (unlikely(exit_state == EXIT_DEAD)) return 0; ret = eligible_child(wo, ptrace, p); if (!ret) return ret; if (unlikely(exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case * we should clear notask_error, debugger will notify us. */ if (likely(!ptrace)) wo->notask_error = 0; return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { /* * If it is traced by its real parent's group, just pretend * the caller is ptrace_do_wait() and reap this child if it * is zombie. * * This also hides group stop state from real parent; otherwise * a single stop can be reported twice as group and ptrace stop. * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. */ if (!ptrace_reparented(p)) ptrace = 1; } /* slay zombie? */ if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ if (!delay_group_leader(p)) { /* * A zombie ptracee is only visible to its ptracer. * Notification and reaping will be cascaded to the * real parent when the ptracer detaches. */ if (unlikely(ptrace) || likely(!p->ptrace)) return wait_task_zombie(wo, p); } /* * Allow access to stopped/continued state via zombie by * falling through. Clearing of notask_error is complex. * * When !@ptrace: * * If WEXITED is set, notask_error should naturally be * cleared. If not, subset of WSTOPPED|WCONTINUED is set, * so, if there are live subthreads, there are events to * wait for. If all subthreads are dead, it's still safe * to clear - this function will be called again in finite * amount time once all the subthreads are released and * will then return without clearing. * * When @ptrace: * * Stopped state is per-task and thus can't change once the * target task dies. Only continued and exited can happen. * Clear notask_error if WCONTINUED | WEXITED. */ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ wo->notask_error = 0; } /* * Wait for stopped. Depending on @ptrace, different stopped state * is used and the two don't interact with each other. */ ret = wait_task_stopped(wo, ptrace, p); if (ret) return ret; /* * Wait for continued. There's only one continued state and the * ptracer can consume it which can confuse the real parent. Don't * use WCONTINUED from ptracer. You don't need or want it. */ return wait_task_continued(wo, p); } /* * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. * Returns nonzero for a final return, when we have unlocked tasklist_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or still -ECHILD. */ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } return 0; } static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) { struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } return 0; } bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p) { if (!eligible_pid(wo, p)) return false; if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent) return false; return true; } static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait); struct task_struct *p = key; if (pid_child_should_wake(wo, p)) return default_wake_function(wait, mode, sync, key); return 0; } void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, TASK_INTERRUPTIBLE, p); } static bool is_effectively_child(struct wait_opts *wo, bool ptrace, struct task_struct *target) { struct task_struct *parent = !ptrace ? target->real_parent : target->parent; return current == parent || (!(wo->wo_flags & __WNOTHREAD) && same_thread_group(current, parent)); } /* * Optimization for waiting on PIDTYPE_PID. No need to iterate through child * and tracee lists to find the target task. */ static int do_wait_pid(struct wait_opts *wo) { bool ptrace; struct task_struct *target; int retval; ptrace = false; target = pid_task(wo->wo_pid, PIDTYPE_TGID); if (target && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } ptrace = true; target = pid_task(wo->wo_pid, PIDTYPE_PID); if (target && target->ptrace && is_effectively_child(wo, ptrace, target)) { retval = wait_consider_task(wo, ptrace, target); if (retval) return retval; } return 0; } long __do_wait(struct wait_opts *wo) { long retval; /* * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet. */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; read_lock(&tasklist_lock); if (wo->wo_type == PIDTYPE_PID) { retval = do_wait_pid(wo); if (retval) return retval; } else { struct task_struct *tsk = current; do { retval = do_wait_thread(wo, tsk); if (retval) return retval; retval = ptrace_do_wait(wo, tsk); if (retval) return retval; if (wo->wo_flags & __WNOTHREAD) break; } while_each_thread(current, tsk); } read_unlock(&tasklist_lock); notask: retval = wo->notask_error; if (!retval && !(wo->wo_flags & WNOHANG)) return -ERESTARTSYS; return retval; } static long do_wait(struct wait_opts *wo) { int retval; trace_sched_process_wait(wo->wo_pid); init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); wo->child_wait.private = current; add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); do { set_current_state(TASK_INTERRUPTIBLE); retval = __do_wait(wo); if (retval != -ERESTARTSYS) break; if (signal_pending(current)) break; schedule(); } while (1); __set_current_state(TASK_RUNNING); remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); return retval; } int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { unsigned int f_flags = 0; struct pid *pid = NULL; enum pid_type type; if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; if (!(options & (WEXITED|WSTOPPED|WCONTINUED))) return -EINVAL; switch (which) { case P_ALL: type = PIDTYPE_MAX; break; case P_PID: type = PIDTYPE_PID; if (upid <= 0) return -EINVAL; pid = find_get_pid(upid); break; case P_PGID: type = PIDTYPE_PGID; if (upid < 0) return -EINVAL; if (upid) pid = find_get_pid(upid); else pid = get_task_pid(current, PIDTYPE_PGID); break; case P_PIDFD: type = PIDTYPE_PID; if (upid < 0) return -EINVAL; pid = pidfd_get_pid(upid, &f_flags); if (IS_ERR(pid)) return PTR_ERR(pid); break; default: return -EINVAL; } wo->wo_type = type; wo->wo_pid = pid; wo->wo_flags = options; wo->wo_info = infop; wo->wo_rusage = ru; if (f_flags & O_NONBLOCK) wo->wo_flags |= WNOHANG; return 0; } static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop, int options, struct rusage *ru) { struct wait_opts wo; long ret; ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru); if (ret) return ret; ret = do_wait(&wo); if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG)) ret = -EAGAIN; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, infop, int, options, struct rusage __user *, ru) { struct rusage r; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } long kernel_wait4(pid_t upid, int __user *stat_addr, int options, struct rusage *ru) { struct wait_opts wo; struct pid *pid = NULL; enum pid_type type; long ret; if (options & ~(WNOHANG|WUNTRACED|WCONTINUED| __WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; /* -INT_MIN is not defined */ if (upid == INT_MIN) return -ESRCH; if (upid == -1) type = PIDTYPE_MAX; else if (upid < 0) { type = PIDTYPE_PGID; pid = find_get_pid(-upid); } else if (upid == 0) { type = PIDTYPE_PGID; pid = get_task_pid(current, PIDTYPE_PGID); } else /* upid > 0 */ { type = PIDTYPE_PID; pid = find_get_pid(upid); } wo.wo_type = type; wo.wo_pid = pid; wo.wo_flags = options | WEXITED; wo.wo_info = NULL; wo.wo_stat = 0; wo.wo_rusage = ru; ret = do_wait(&wo); put_pid(pid); if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr)) ret = -EFAULT; return ret; } int kernel_wait(pid_t pid, int *stat) { struct wait_opts wo = { .wo_type = PIDTYPE_PID, .wo_pid = find_get_pid(pid), .wo_flags = WEXITED, }; int ret; ret = do_wait(&wo); if (ret > 0 && wo.wo_stat) *stat = wo.wo_stat; put_pid(wo.wo_pid); return ret; } SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, int, options, struct rusage __user *, ru) { struct rusage r; long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && copy_to_user(ru, &r, sizeof(struct rusage))) return -EFAULT; } return err; } #ifdef __ARCH_WANT_SYS_WAITPID /* * sys_waitpid() remains for compatibility. waitpid() should be * implemented by calling sys_wait4() from libc.a. */ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options) { return kernel_wait4(pid, stat_addr, options, NULL); } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(wait4, compat_pid_t, pid, compat_uint_t __user *, stat_addr, int, options, struct compat_rusage __user *, ru) { struct rusage r; long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL); if (err > 0) { if (ru && put_compat_rusage(&r, ru)) return -EFAULT; } return err; } COMPAT_SYSCALL_DEFINE5(waitid, int, which, compat_pid_t, pid, struct compat_siginfo __user *, infop, int, options, struct compat_rusage __user *, uru) { struct rusage ru; struct waitid_info info = {.status = 0}; long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL); int signo = 0; if (err > 0) { signo = SIGCHLD; err = 0; if (uru) { /* kernel_waitid() overwrites everything in ru */ if (COMPAT_USE_64BIT_TIME) err = copy_to_user(uru, &ru, sizeof(ru)); else err = put_compat_rusage(&ru, uru); if (err) return -EFAULT; } } if (!infop) return err; if (!user_write_access_begin(infop, sizeof(*infop))) return -EFAULT; unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); user_write_access_end(); return err; Efault: user_write_access_end(); return -EFAULT; } #endif /* * This needs to be __function_aligned as GCC implicitly makes any * implementation of abort() cold and drops alignment specified by * -falign-functions=N. * * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11 */ __weak __function_aligned void abort(void) { BUG(); /* if that doesn't kill us, halt */ panic("Oops failed to kill thread"); } EXPORT_SYMBOL(abort);
78 77 178 88 3 1 79 78 93 93 94 89 81 1 73 80 82 82 93 195 2 2 2 186 178 96 79 170 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/kvm_host.h> #include <asm/kvm_emulate.h> #include <trace/events/kvm.h> #include "trace.h" void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data) { void *datap = NULL; union { u8 byte; u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: tmp.byte = data; datap = &tmp.byte; break; case 2: tmp.hword = data; datap = &tmp.hword; break; case 4: tmp.word = data; datap = &tmp.word; break; case 8: tmp.dword = data; datap = &tmp.dword; break; } memcpy(buf, datap, len); } unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) { unsigned long data = 0; union { u16 hword; u32 word; u64 dword; } tmp; switch (len) { case 1: data = *(u8 *)buf; break; case 2: memcpy(&tmp.hword, buf, len); data = tmp.hword; break; case 4: memcpy(&tmp.word, buf, len); data = tmp.word; break; case 8: memcpy(&tmp.dword, buf, len); data = tmp.dword; break; } return data; } static bool kvm_pending_external_abort(struct kvm_vcpu *vcpu) { if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION)) return false; if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA32_UND): case unpack_vcpu_flag(EXCEPT_AA32_IABT): case unpack_vcpu_flag(EXCEPT_AA32_DABT): return true; default: return false; } } else { switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC): case unpack_vcpu_flag(EXCEPT_AA64_EL1_SERR): case unpack_vcpu_flag(EXCEPT_AA64_EL2_SERR): return true; default: return false; } } } /** * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation * or in-kernel IO emulation * * @vcpu: The VCPU pointer */ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) { unsigned long data; unsigned int len; int mask; /* * Detect if the MMIO return was already handled or if userspace aborted * the MMIO access. */ if (unlikely(!vcpu->mmio_needed || kvm_pending_external_abort(vcpu))) return 1; vcpu->mmio_needed = 0; if (!kvm_vcpu_dabt_iswrite(vcpu)) { struct kvm_run *run = vcpu->run; len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) { mask = 1U << ((len * 8) - 1); data = (data ^ mask) - mask; } if (!kvm_vcpu_dabt_issf(vcpu)) data = data & 0xffffffff; trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data); } /* * The MMIO instruction is emulated and should not be re-executed * in the guest. */ kvm_incr_pc(vcpu); return 1; } int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { struct kvm_run *run = vcpu->run; unsigned long data; unsigned long rt; int ret; bool is_write; int len; u8 data_buf[8]; /* * No valid syndrome? Ask userspace for help if it has * volunteered to do so, and bail out otherwise. * * In the protected VM case, there isn't much userspace can do * though, so directly deliver an exception to the guest. */ if (!kvm_vcpu_dabt_isvalid(vcpu)) { trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu), kvm_vcpu_get_hfar(vcpu), fault_ipa); if (vcpu_is_protected(vcpu)) return kvm_inject_sea_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &vcpu->kvm->arch.flags)) { run->exit_reason = KVM_EXIT_ARM_NISV; run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu); run->arm_nisv.fault_ipa = fault_ipa; return 0; } return -ENOSYS; } /* * Prepare MMIO operation. First decode the syndrome data we get * from the CPU. Then try if some in-kernel emulation feels * responsible, otherwise let user space do its magic. */ is_write = kvm_vcpu_dabt_iswrite(vcpu); len = kvm_vcpu_dabt_get_as(vcpu); rt = kvm_vcpu_dabt_get_rd(vcpu); if (is_write) { data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } /* Now prepare kvm_run for the potential return to userland. */ run->mmio.is_write = is_write; run->mmio.phys_addr = fault_ipa; run->mmio.len = len; vcpu->mmio_needed = 1; if (!ret) { /* We handled the access successfully in the kernel. */ if (!is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; kvm_handle_mmio_return(vcpu); return 1; } if (is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_user++; run->exit_reason = KVM_EXIT_MMIO; return 0; }
2 1 1 1 1 1 1 1 3 2 3 2 8 1 7 3 1 2 1 1 1 4 4 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 // SPDX-License-Identifier: GPL-2.0-only /* * VFIO-KVM bridge pseudo device * * Copyright (C) 2013 Red Hat, Inc. All rights reserved. * Author: Alex Williamson <alex.williamson@redhat.com> */ #include <linux/errno.h> #include <linux/file.h> #include <linux/kvm_host.h> #include <linux/list.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vfio.h> #include "vfio.h" #ifdef CONFIG_SPAPR_TCE_IOMMU #include <asm/kvm_ppc.h> #endif struct kvm_vfio_file { struct list_head node; struct file *file; #ifdef CONFIG_SPAPR_TCE_IOMMU struct iommu_group *iommu_group; #endif }; struct kvm_vfio { struct list_head file_list; struct mutex lock; bool noncoherent; }; static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm) { void (*fn)(struct file *file, struct kvm *kvm); fn = symbol_get(vfio_file_set_kvm); if (!fn) return; fn(file, kvm); symbol_put(vfio_file_set_kvm); } static bool kvm_vfio_file_enforced_coherent(struct file *file) { bool (*fn)(struct file *file); bool ret; fn = symbol_get(vfio_file_enforced_coherent); if (!fn) return false; ret = fn(file); symbol_put(vfio_file_enforced_coherent); return ret; } static bool kvm_vfio_file_is_valid(struct file *file) { bool (*fn)(struct file *file); bool ret; fn = symbol_get(vfio_file_is_valid); if (!fn) return false; ret = fn(file); symbol_put(vfio_file_is_valid); return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); struct iommu_group *ret; fn = symbol_get(vfio_file_iommu_group); if (!fn) return NULL; ret = fn(file); symbol_put(vfio_file_iommu_group); return ret; } static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_file *kvf) { if (WARN_ON_ONCE(!kvf->iommu_group)) return; kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group); iommu_group_put(kvf->iommu_group); kvf->iommu_group = NULL; } #endif /* * Groups/devices can use the same or different IOMMU domains. If the same * then adding a new group/device may change the coherency of groups/devices * we've previously been told about. We don't want to care about any of * that so we retest each group/device and bail as soon as we find one that's * noncoherent. This means we only ever [un]register_noncoherent_dma once * for the whole device. */ static void kvm_vfio_update_coherency(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; bool noncoherent = false; struct kvm_vfio_file *kvf; list_for_each_entry(kvf, &kv->file_list, node) { if (!kvm_vfio_file_enforced_coherent(kvf->file)) { noncoherent = true; break; } } if (noncoherent != kv->noncoherent) { kv->noncoherent = noncoherent; if (kv->noncoherent) kvm_arch_register_noncoherent_dma(dev->kvm); else kvm_arch_unregister_noncoherent_dma(dev->kvm); } } static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; struct file *filp; int ret = 0; filp = fget(fd); if (!filp) return -EBADF; /* Ensure the FD is a vfio FD. */ if (!kvm_vfio_file_is_valid(filp)) { ret = -EINVAL; goto out_fput; } mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file == filp) { ret = -EEXIST; goto out_unlock; } } kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT); if (!kvf) { ret = -ENOMEM; goto out_unlock; } kvf->file = get_file(filp); list_add_tail(&kvf->node, &kv->file_list); kvm_arch_start_assignment(dev->kvm); kvm_vfio_file_set_kvm(kvf->file, dev->kvm); kvm_vfio_update_coherency(dev); out_unlock: mutex_unlock(&kv->lock); out_fput: fput(filp); return ret; } static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; CLASS(fd, f)(fd); int ret; if (fd_empty(f)) return -EBADF; ret = -ENOENT; mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file != fd_file(f)) continue; list_del(&kvf->node); kvm_arch_end_assignment(dev->kvm); #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif kvm_vfio_file_set_kvm(kvf->file, NULL); fput(kvf->file); kfree(kvf); ret = 0; break; } kvm_vfio_update_coherency(dev); mutex_unlock(&kv->lock); return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev, void __user *arg) { struct kvm_vfio_spapr_tce param; struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf; int ret; if (copy_from_user(&param, arg, sizeof(struct kvm_vfio_spapr_tce))) return -EFAULT; CLASS(fd, f)(param.groupfd); if (fd_empty(f)) return -EBADF; ret = -ENOENT; mutex_lock(&kv->lock); list_for_each_entry(kvf, &kv->file_list, node) { if (kvf->file != fd_file(f)) continue; if (!kvf->iommu_group) { kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file); if (WARN_ON_ONCE(!kvf->iommu_group)) { ret = -EIO; goto err_fdput; } } ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, kvf->iommu_group); break; } err_fdput: mutex_unlock(&kv->lock); return ret; } #endif static int kvm_vfio_set_file(struct kvm_device *dev, long attr, void __user *arg) { int32_t __user *argp = arg; int32_t fd; switch (attr) { case KVM_DEV_VFIO_FILE_ADD: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_add(dev, fd); case KVM_DEV_VFIO_FILE_DEL: if (get_user(fd, argp)) return -EFAULT; return kvm_vfio_file_del(dev, fd); #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: return kvm_vfio_file_set_spapr_tce(dev, arg); #endif } return -ENXIO; } static int kvm_vfio_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_VFIO_FILE: return kvm_vfio_set_file(dev, attr->attr, u64_to_user_ptr(attr->addr)); } return -ENXIO; } static int kvm_vfio_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_VFIO_FILE: switch (attr->attr) { case KVM_DEV_VFIO_FILE_ADD: case KVM_DEV_VFIO_FILE_DEL: #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: #endif return 0; } break; } return -ENXIO; } static void kvm_vfio_release(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; struct kvm_vfio_file *kvf, *tmp; list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) { #ifdef CONFIG_SPAPR_TCE_IOMMU kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif kvm_vfio_file_set_kvm(kvf->file, NULL); fput(kvf->file); list_del(&kvf->node); kfree(kvf); kvm_arch_end_assignment(dev->kvm); } kvm_vfio_update_coherency(dev); kfree(kv); kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */ } static int kvm_vfio_create(struct kvm_device *dev, u32 type); static const struct kvm_device_ops kvm_vfio_ops = { .name = "kvm-vfio", .create = kvm_vfio_create, .release = kvm_vfio_release, .set_attr = kvm_vfio_set_attr, .has_attr = kvm_vfio_has_attr, }; static int kvm_vfio_create(struct kvm_device *dev, u32 type) { struct kvm_device *tmp; struct kvm_vfio *kv; lockdep_assert_held(&dev->kvm->lock); /* Only one VFIO "device" per VM */ list_for_each_entry(tmp, &dev->kvm->devices, vm_node) if (tmp->ops == &kvm_vfio_ops) return -EBUSY; kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT); if (!kv) return -ENOMEM; INIT_LIST_HEAD(&kv->file_list); mutex_init(&kv->lock); dev->private = kv; return 0; } int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } void kvm_vfio_ops_exit(void) { kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); }
9 167 167 166 167 167 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Implementation of the access vector table type. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* Updated: Frank Mayer <mayerf@tresys.com> and * Karl MacMillan <kmacmillan@tresys.com> * Added conditional policy language extensions * Copyright (C) 2003 Tresys Technology, LLC * * Updated: Yuichi Nakamura <ynakam@hitachisoft.jp> * Tuned number of hash slots for avtab to reduce memory usage */ #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include "avtab.h" #include "policydb.h" static struct kmem_cache *avtab_node_cachep __ro_after_init; static struct kmem_cache *avtab_xperms_cachep __ro_after_init; /* Based on MurmurHash3, written by Austin Appleby and placed in the * public domain. */ static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask) { static const u32 c1 = 0xcc9e2d51; static const u32 c2 = 0x1b873593; static const u32 r1 = 15; static const u32 r2 = 13; static const u32 m = 5; static const u32 n = 0xe6546b64; u32 hash = 0; #define mix(input) \ do { \ u32 v = input; \ v *= c1; \ v = (v << r1) | (v >> (32 - r1)); \ v *= c2; \ hash ^= v; \ hash = (hash << r2) | (hash >> (32 - r2)); \ hash = hash * m + n; \ } while (0) mix(keyp->target_class); mix(keyp->target_type); mix(keyp->source_type); #undef mix hash ^= hash >> 16; hash *= 0x85ebca6b; hash ^= hash >> 13; hash *= 0xc2b2ae35; hash ^= hash >> 16; return hash & mask; } static struct avtab_node *avtab_insert_node(struct avtab *h, struct avtab_node **dst, const struct avtab_key *key, const struct avtab_datum *datum) { struct avtab_node *newnode; struct avtab_extended_perms *xperms; newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; newnode->key = *key; if (key->specified & AVTAB_XPERMS) { xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL); if (xperms == NULL) { kmem_cache_free(avtab_node_cachep, newnode); return NULL; } *xperms = *(datum->u.xperms); newnode->datum.u.xperms = xperms; } else { newnode->datum.u.data = datum->u.data; } newnode->next = *dst; *dst = newnode; h->nel++; return newnode; } static int avtab_node_cmp(const struct avtab_key *key1, const struct avtab_key *key2) { u16 specified = key1->specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD); if (key1->source_type == key2->source_type && key1->target_type == key2->target_type && key1->target_class == key2->target_class && (specified & key2->specified)) return 0; if (key1->source_type < key2->source_type) return -1; if (key1->source_type == key2->source_type && key1->target_type < key2->target_type) return -1; if (key1->source_type == key2->source_type && key1->target_type == key2->target_type && key1->target_class < key2->target_class) return -1; return 1; } static int avtab_insert(struct avtab *h, const struct avtab_key *key, const struct avtab_datum *datum) { u32 hvalue; struct avtab_node *prev, *cur, *newnode; int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return -EINVAL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); /* extended perms may not be unique */ if (cmp == 0 && !(key->specified & AVTAB_XPERMS)) return -EEXIST; if (cmp <= 0) break; } newnode = avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue], key, datum); if (!newnode) return -ENOMEM; return 0; } /* Unlike avtab_insert(), this function allow multiple insertions of the same * key/specified mask into the table, as needed by the conditional avtab. * It also returns a pointer to the node inserted. */ struct avtab_node *avtab_insert_nonunique(struct avtab *h, const struct avtab_key *key, const struct avtab_datum *datum) { u32 hvalue; struct avtab_node *prev, *cur; int cmp; if (!h || !h->nslot || h->nel == U32_MAX) return NULL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); if (cmp <= 0) break; } return avtab_insert_node(h, prev ? &prev->next : &h->htable[hvalue], key, datum); } /* This search function returns a node pointer, and can be used in * conjunction with avtab_search_next_node() */ struct avtab_node *avtab_search_node(struct avtab *h, const struct avtab_key *key) { u32 hvalue; struct avtab_node *cur; int cmp; if (!h || !h->nslot) return NULL; hvalue = avtab_hash(key, h->mask); for (cur = h->htable[hvalue]; cur; cur = cur->next) { cmp = avtab_node_cmp(key, &cur->key); if (cmp == 0) return cur; if (cmp < 0) break; } return NULL; } struct avtab_node *avtab_search_node_next(struct avtab_node *node, u16 specified) { struct avtab_key tmp_key; struct avtab_node *cur; int cmp; if (!node) return NULL; tmp_key = node->key; tmp_key.specified = specified; for (cur = node->next; cur; cur = cur->next) { cmp = avtab_node_cmp(&tmp_key, &cur->key); if (cmp == 0) return cur; if (cmp < 0) break; } return NULL; } void avtab_destroy(struct avtab *h) { u32 i; struct avtab_node *cur, *temp; if (!h) return; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; while (cur) { temp = cur; cur = cur->next; if (temp->key.specified & AVTAB_XPERMS) kmem_cache_free(avtab_xperms_cachep, temp->datum.u.xperms); kmem_cache_free(avtab_node_cachep, temp); } } kvfree(h->htable); h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } void avtab_init(struct avtab *h) { h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } static int avtab_alloc_common(struct avtab *h, u32 nslot) { if (!nslot) return 0; h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL); if (!h->htable) return -ENOMEM; h->nslot = nslot; h->mask = nslot - 1; return 0; } int avtab_alloc(struct avtab *h, u32 nrules) { int rc; u32 nslot = 0; if (nrules != 0) { nslot = nrules > 3 ? rounddown_pow_of_two(nrules / 2) : 2; if (nslot > MAX_AVTAB_HASH_BUCKETS) nslot = MAX_AVTAB_HASH_BUCKETS; rc = avtab_alloc_common(h, nslot); if (rc) return rc; } pr_debug("SELinux: %d avtab hash slots, %d rules.\n", nslot, nrules); return 0; } int avtab_alloc_dup(struct avtab *new, const struct avtab *orig) { return avtab_alloc_common(new, orig->nslot); } #ifdef CONFIG_SECURITY_SELINUX_DEBUG void avtab_hash_eval(struct avtab *h, const char *tag) { u32 i, chain_len, slots_used, max_chain_len; unsigned long long chain2_len_sum; struct avtab_node *cur; slots_used = 0; max_chain_len = 0; chain2_len_sum = 0; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; if (cur) { slots_used++; chain_len = 0; while (cur) { chain_len++; cur = cur->next; } if (chain_len > max_chain_len) max_chain_len = chain_len; chain2_len_sum += (unsigned long long)chain_len * chain_len; } } pr_debug("SELinux: %s: %d entries and %d/%d buckets used, " "longest chain length %d, sum of chain length^2 %llu\n", tag, h->nel, slots_used, h->nslot, max_chain_len, chain2_len_sum); } #endif /* CONFIG_SECURITY_SELINUX_DEBUG */ /* clang-format off */ static const uint16_t spec_order[] = { AVTAB_ALLOWED, AVTAB_AUDITDENY, AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, AVTAB_MEMBER, AVTAB_XPERMS_ALLOWED, AVTAB_XPERMS_AUDITALLOW, AVTAB_XPERMS_DONTAUDIT }; /* clang-format on */ int avtab_read_item(struct avtab *a, struct policy_file *fp, struct policydb *pol, int (*insertf)(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *p), void *p, bool conditional) { __le16 buf16[4]; u16 enabled; u32 items, items2, val, i; struct avtab_key key; struct avtab_datum datum; struct avtab_extended_perms xperms; __le32 buf32[ARRAY_SIZE(xperms.perms.p)]; int rc; unsigned int set, vers = pol->policyvers; memset(&key, 0, sizeof(struct avtab_key)); memset(&datum, 0, sizeof(struct avtab_datum)); if (vers < POLICYDB_VERSION_AVTAB) { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items2 = le32_to_cpu(buf32[0]); if (items2 > ARRAY_SIZE(buf32)) { pr_err("SELinux: avtab: entry overflow\n"); return -EINVAL; } rc = next_entry(buf32, fp, sizeof(u32) * items2); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; val = le32_to_cpu(buf32[items++]); key.source_type = (u16)val; if (key.source_type != val) { pr_err("SELinux: avtab: truncated source type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_type = (u16)val; if (key.target_type != val) { pr_err("SELinux: avtab: truncated target type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_class = (u16)val; if (key.target_class != val) { pr_err("SELinux: avtab: truncated target class\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); enabled = (val & AVTAB_ENABLED_OLD) ? AVTAB_ENABLED : 0; if (!(val & (AVTAB_AV | AVTAB_TYPE))) { pr_err("SELinux: avtab: null entry\n"); return -EINVAL; } if ((val & AVTAB_AV) && (val & AVTAB_TYPE)) { pr_err("SELinux: avtab: entry has both access vectors and types\n"); return -EINVAL; } if (val & AVTAB_XPERMS) { pr_err("SELinux: avtab: entry has extended permissions\n"); return -EINVAL; } for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (val & spec_order[i]) { key.specified = spec_order[i] | enabled; datum.u.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if (rc) return rc; } } if (items != items2) { pr_err("SELinux: avtab: entry only had %d items, expected %d\n", items2, items); return -EINVAL; } return 0; } rc = next_entry(buf16, fp, sizeof(u16) * 4); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; key.source_type = le16_to_cpu(buf16[items++]); key.target_type = le16_to_cpu(buf16[items++]); key.target_class = le16_to_cpu(buf16[items++]); key.specified = le16_to_cpu(buf16[items++]); if (!policydb_type_isvalid(pol, key.source_type) || !policydb_type_isvalid(pol, key.target_type) || !policydb_class_isvalid(pol, key.target_class)) { pr_err("SELinux: avtab: invalid type or class\n"); return -EINVAL; } set = hweight16(key.specified & (AVTAB_XPERMS | AVTAB_TYPE | AVTAB_AV)); if (!set || set > 1) { pr_err("SELinux: avtab: more than one specifier\n"); return -EINVAL; } if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) && (key.specified & AVTAB_XPERMS)) { pr_err("SELinux: avtab: policy version %u does not " "support extended permissions rules and one " "was specified\n", vers); return -EINVAL; } else if ((vers < POLICYDB_VERSION_COND_XPERMS) && (key.specified & AVTAB_XPERMS) && conditional) { pr_err("SELinux: avtab: policy version %u does not " "support extended permissions rules in conditional " "policies and one was specified\n", vers); return -EINVAL; } else if (key.specified & AVTAB_XPERMS) { memset(&xperms, 0, sizeof(struct avtab_extended_perms)); rc = next_entry(&xperms.specified, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(&xperms.driver, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(buf32, fp, sizeof(u32) * ARRAY_SIZE(xperms.perms.p)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++) xperms.perms.p[i] = le32_to_cpu(buf32[i]); datum.u.xperms = &xperms; } else { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } datum.u.data = le32_to_cpu(*buf32); } if ((key.specified & AVTAB_TYPE) && !policydb_type_isvalid(pol, datum.u.data)) { pr_err("SELinux: avtab: invalid type\n"); return -EINVAL; } return insertf(a, &key, &datum, p); } static int avtab_insertf(struct avtab *a, const struct avtab_key *k, const struct avtab_datum *d, void *p) { return avtab_insert(a, k, d); } int avtab_read(struct avtab *a, struct policy_file *fp, struct policydb *pol) { int rc; __le32 buf[1]; u32 nel, i; rc = next_entry(buf, fp, sizeof(u32)); if (rc < 0) { pr_err("SELinux: avtab: truncated table\n"); goto bad; } nel = le32_to_cpu(buf[0]); if (!nel) { pr_err("SELinux: avtab: table is empty\n"); rc = -EINVAL; goto bad; } rc = avtab_alloc(a, nel); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = avtab_read_item(a, fp, pol, avtab_insertf, NULL, false); if (rc) { if (rc == -ENOMEM) pr_err("SELinux: avtab: out of memory\n"); else if (rc == -EEXIST) pr_err("SELinux: avtab: duplicate entry\n"); goto bad; } } rc = 0; out: return rc; bad: avtab_destroy(a); goto out; } int avtab_write_item(struct policydb *p, const struct avtab_node *cur, struct policy_file *fp) { __le16 buf16[4]; __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)]; int rc; unsigned int i; buf16[0] = cpu_to_le16(cur->key.source_type); buf16[1] = cpu_to_le16(cur->key.target_type); buf16[2] = cpu_to_le16(cur->key.target_class); buf16[3] = cpu_to_le16(cur->key.specified); rc = put_entry(buf16, sizeof(u16), 4, fp); if (rc) return rc; if (cur->key.specified & AVTAB_XPERMS) { rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp); if (rc) return rc; rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp); if (rc) return rc; for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++) buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]); rc = put_entry(buf32, sizeof(u32), ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp); } else { buf32[0] = cpu_to_le32(cur->datum.u.data); rc = put_entry(buf32, sizeof(u32), 1, fp); } if (rc) return rc; return 0; } int avtab_write(struct policydb *p, struct avtab *a, struct policy_file *fp) { u32 i; int rc = 0; struct avtab_node *cur; __le32 buf[1]; buf[0] = cpu_to_le32(a->nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < a->nslot; i++) { for (cur = a->htable[i]; cur; cur = cur->next) { rc = avtab_write_item(p, cur, fp); if (rc) return rc; } } return rc; } void __init avtab_cache_init(void) { avtab_node_cachep = KMEM_CACHE(avtab_node, SLAB_PANIC); avtab_xperms_cachep = KMEM_CACHE(avtab_extended_perms, SLAB_PANIC); }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0 /* Watch queue and general notification mechanism, built on pipes * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/poll.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/sched/signal.h> #include <linux/watch_queue.h> #include <linux/pipe_fs_i.h> MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) /* * This must be called under the RCU read-lock, which makes * sure that the wqueue still exists. It can then take the lock, * and check that the wqueue hasn't been destroyed, which in * turn makes sure that the notification pipe still exists. */ static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } return true; } static inline void unlock_wqueue(struct watch_queue *wqueue) { spin_unlock_bh(&wqueue->lock); } static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct watch_queue *wqueue = (struct watch_queue *)buf->private; struct page *page; unsigned int bit; /* We need to work out which note within the page this refers to, but * the note might have been maximum size, so merely ANDing the offset * off doesn't work. OTOH, the note must've been more than zero size. */ bit = buf->offset + buf->len; if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) bit -= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; bit += page->private; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing #define watch_queue_pipe_buf_try_steal NULL /* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { .release = watch_queue_pipe_buf_release, .try_steal = watch_queue_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* * Post a notification to a watch queue. * * Must be called with the RCU lock for reading, and the * watch_queue lock held, which guarantees that the pipe * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) { void *p; struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; unsigned int head, tail, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) goto lost; note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; p = kmap_atomic(page); memcpy(p + offset, n, len); kunmap_atomic(p); buf = pipe_buf(pipe, head); buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); BUG(); } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; out: spin_unlock_irq(&pipe->rd_wait.lock); if (done) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); return done; lost: buf = pipe_buf(pipe, head - 1); buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } /* * Apply filter rules to a notification. */ static bool filter_watch_notification(const struct watch_filter *wf, const struct watch_notification *n) { const struct watch_type_filter *wt; unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; unsigned int st_index = n->subtype / st_bits; unsigned int st_bit = 1U << (n->subtype % st_bits); int i; if (!test_bit(n->type, wf->type_filter)) return false; for (i = 0; i < wf->nr_filters; i++) { wt = &wf->filters[i]; if (n->type == wt->type && (wt->subtype_filter[st_index] & st_bit) && (n->info & wt->info_mask) == wt->info_filter) return true; } return false; /* If there is a filter, the default is to reject. */ } /** * __post_watch_notification - Post an event notification * @wlist: The watch list to post the event to. * @n: The notification record to post. * @cred: The creds of the process that triggered the notification. * @id: The ID to match on the watch. * * Post a notification of an event into a set of watch queues and let the users * know. * * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and * should be in units of sizeof(*n). */ void __post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { const struct watch_filter *wf; struct watch_queue *wqueue; struct watch *watch; if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { WARN_ON(1); return; } rcu_read_lock(); hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { if (watch->id != id) continue; n->info &= ~WATCH_INFO_ID; n->info |= watch->info_id; wqueue = rcu_dereference(watch->queue); wf = rcu_dereference(wqueue->filter); if (wf && !filter_watch_notification(wf, n)) continue; if (security_post_notification(watch->cred, cred, n) < 0) continue; if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); unlock_wqueue(wqueue); } } rcu_read_unlock(); } EXPORT_SYMBOL(__post_watch_notification); /* * Allocate sufficient pages to preallocation for the requested number of * notifications. */ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) { struct watch_queue *wqueue = pipe->watch_queue; struct page **pages; unsigned long *bitmap; unsigned long user_bufs; int ret, i, nr_pages; if (!wqueue) return -ENODEV; if (wqueue->notes) return -EBUSY; if (nr_notes < 1 || nr_notes > 512) /* TODO: choose a better hard limit */ return -EINVAL; nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); if (nr_pages > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto error; } nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; /* * pipe_resize_ring() does not update nr_accounted for watch_queue * pipes, because the above vastly overprovisions. Set nr_accounted on * and max_usage this pipe to the number that was actually charged to * the user above via account_pipe_buffers. */ pipe->max_usage = nr_pages; pipe->nr_accounted = nr_pages; ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto error; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; wqueue->nr_notes = nr_notes; return 0; error_p: while (--i >= 0) __free_page(pages[i]); kfree(pages); error: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); return ret; } /* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, struct watch_notification_filter __user *_filter) { struct watch_notification_type_filter *tf; struct watch_notification_filter filter; struct watch_type_filter *q; struct watch_filter *wfilter; struct watch_queue *wqueue = pipe->watch_queue; int ret, nr_filter = 0, i; if (!wqueue) return -ENODEV; if (!_filter) { /* Remove the old filter */ wfilter = NULL; goto set; } /* Grab the user's filter specification */ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) return -EFAULT; if (filter.nr_filters == 0 || filter.nr_filters > 16 || filter.__reserved != 0) return -EINVAL; tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); ret = -EINVAL; for (i = 0; i < filter.nr_filters; i++) { if ((tf[i].info_filter & ~tf[i].info_mask) || tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } /* Now we need to build the internal filter from only the relevant * user-specified filters. */ ret = -ENOMEM; wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); if (!wfilter) goto err_filter; wfilter->nr_filters = nr_filter; q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; q->info_filter = tf[i].info_filter; q->info_mask = tf[i].info_mask; q->subtype_filter[0] = tf[i].subtype_filter[0]; __set_bit(q->type, wfilter->type_filter); q++; } kfree(tf); set: pipe_lock(pipe); wfilter = rcu_replace_pointer(wqueue->filter, wfilter, lockdep_is_held(&pipe->mutex)); pipe_unlock(pipe); if (wfilter) kfree_rcu(wfilter, rcu); return 0; err_filter: kfree(tf); return ret; } static void __put_watch_queue(struct kref *kref) { struct watch_queue *wqueue = container_of(kref, struct watch_queue, usage); struct watch_filter *wfilter; int i; for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) kfree_rcu(wfilter, rcu); kfree_rcu(wqueue, rcu); } /** * put_watch_queue - Dispose of a ref on a watchqueue. * @wqueue: The watch queue to unref. */ void put_watch_queue(struct watch_queue *wqueue) { kref_put(&wqueue->usage, __put_watch_queue); } EXPORT_SYMBOL(put_watch_queue); static void free_watch(struct rcu_head *rcu) { struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); kfree(watch); } static void __put_watch(struct kref *kref) { struct watch *watch = container_of(kref, struct watch, usage); call_rcu(&watch->rcu, free_watch); } /* * Discard a watch. */ static void put_watch(struct watch *watch) { kref_put(&watch->usage, __put_watch); } /** * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * * Initialise a watch and set the watch queue. */ void init_watch(struct watch *watch, struct watch_queue *wqueue) { kref_init(&watch->usage); INIT_HLIST_NODE(&watch->list_node); INIT_HLIST_NODE(&watch->queue_node); rcu_assign_pointer(watch->queue, wqueue); } static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) { const struct cred *cred; struct watch *w; hlist_for_each_entry(w, &wlist->watchers, list_node) { struct watch_queue *wq = rcu_access_pointer(w->queue); if (wqueue == wq && watch->id == w->id) return -EBUSY; } cred = current_cred(); if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { atomic_dec(&cred->user->nr_watches); return -EAGAIN; } watch->cred = get_cred(cred); rcu_assign_pointer(watch->watch_list, wlist); kref_get(&wqueue->usage); kref_get(&watch->usage); hlist_add_head(&watch->queue_node, &wqueue->watches); hlist_add_head_rcu(&watch->list_node, &wlist->watchers); return 0; } /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add * @wlist: The watch list to add to * * @watch->queue must have been set to point to the queue to post notifications * to and the watch list of the object to be watched. @watch->cred must also * have been set to the appropriate credentials and a ref taken on them. * * The caller must pin the queue and the list both and must hold the list * locked against racing watch additions/removals. */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { struct watch_queue *wqueue; int ret = -ENOENT; rcu_read_lock(); wqueue = rcu_access_pointer(watch->queue); if (lock_wqueue(wqueue)) { spin_lock(&wlist->lock); ret = add_one_watch(watch, wlist, wqueue); spin_unlock(&wlist->lock); unlock_wqueue(wqueue); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(add_watch_to_object); /** * remove_watch_from_object - Remove a watch or all watches from an object. * @wlist: The watch list to remove from * @wq: The watch queue of interest (ignored if @all is true) * @id: The ID of the watch to remove (ignored if @all is true) * @all: True to remove all objects * * Remove a specific watch or all watches from an object. A notification is * sent to the watcher to tell them that this happened. */ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, u64 id, bool all) { struct watch_notification_removal n; struct watch_queue *wqueue; struct watch *watch; int ret = -EBADSLT; rcu_read_lock(); again: spin_lock(&wlist->lock); hlist_for_each_entry(watch, &wlist->watchers, list_node) { if (all || (watch->id == id && rcu_access_pointer(watch->queue) == wq)) goto found; } spin_unlock(&wlist->lock); goto out; found: ret = 0; hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); spin_unlock(&wlist->lock); /* We now own the reference on watch that used to belong to wlist. */ n.watch.type = WATCH_TYPE_META; n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; n.watch.info = watch->info_id | watch_sizeof(n.watch); n.id = id; if (id != 0) n.watch.info = watch->info_id | watch_sizeof(n); wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } unlock_wqueue(wqueue); } if (wlist->release_watch) { void (*release_watch)(struct watch *); release_watch = wlist->release_watch; rcu_read_unlock(); (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); if (all && !hlist_empty(&wlist->watchers)) goto again; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(remove_watch_from_object); /* * Remove all the watches that are contributory to a queue. This has the * potential to race with removal of the watches by the destruction of the * objects being watched or with the distribution of notifications. */ void watch_queue_clear(struct watch_queue *wqueue) { struct watch_list *wlist; struct watch *watch; bool release; rcu_read_lock(); spin_lock_bh(&wqueue->lock); /* * This pipe can be freed by callers like free_pipe_info(). * Removing this reference also prevents new notifications. */ wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); hlist_del_init_rcu(&watch->queue_node); /* We now own a ref on the watch. */ spin_unlock_bh(&wqueue->lock); /* We can't do the next bit under the queue lock as we need to * get the list lock - which would cause a deadlock if someone * was removing from the opposite direction at the same time or * posting a notification. */ wlist = rcu_dereference(watch->watch_list); if (wlist) { void (*release_watch)(struct watch *); spin_lock(&wlist->lock); release = !hlist_unhashed(&watch->list_node); if (release) { hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); /* We now own a second ref on the watch. */ } release_watch = wlist->release_watch; spin_unlock(&wlist->lock); if (release) { if (release_watch) { rcu_read_unlock(); /* This might need to call dput(), so * we have to drop all the locks. */ (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); } } put_watch(watch); spin_lock_bh(&wqueue->lock); } spin_unlock_bh(&wqueue->lock); rcu_read_unlock(); } /** * get_watch_queue - Get a watch queue from its file descriptor. * @fd: The fd to query. */ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); CLASS(fd, f)(fd); if (!fd_empty(f)) { pipe = get_pipe_info(fd_file(f), false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } } return wqueue; } EXPORT_SYMBOL(get_watch_queue); /* * Initialise a watch queue */ int watch_queue_init(struct pipe_inode_info *pipe) { struct watch_queue *wqueue; wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); if (!wqueue) return -ENOMEM; wqueue->pipe = pipe; kref_init(&wqueue->usage); spin_lock_init(&wqueue->lock); INIT_HLIST_HEAD(&wqueue->watches); pipe->watch_queue = wqueue; return 0; }
1 2 10 9 9 9 2 7 12 2 10 10 2 3 3 1 2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 /* * llc_input.c - Minimal input path for LLC * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/llc_sap.h> #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) #endif /* * Packet handler for the station, registerable because in the minimal * LLC core that is taking shape only the very minimal subset of LLC that * is needed for things like IPX, Appletalk, etc will stay, with all the * rest in the llc1 and llc2 modules. */ static void (*llc_station_handler)(struct sk_buff *skb); /* * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. */ static void (*llc_type_handlers[2])(struct llc_sap *sap, struct sk_buff *skb); void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { /* Ensure initialisation is complete before it's called */ if (handler) smp_wmb(); llc_station_handler = handler; if (!handler) synchronize_net(); } /** * llc_pdu_type - returns which LLC component must handle for PDU * @skb: input skb * * This function returns which LLC component must handle this PDU. */ static __inline__ int llc_pdu_type(struct sk_buff *skb) { int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) goto out; switch (LLC_U_PDU_CMD(pdu)) { case LLC_1_PDU_CMD_XID: case LLC_1_PDU_CMD_UI: case LLC_1_PDU_CMD_TEST: type = LLC_DEST_SAP; break; case LLC_2_PDU_CMD_SABME: case LLC_2_PDU_CMD_DISC: case LLC_2_PDU_RSP_UA: case LLC_2_PDU_RSP_DM: case LLC_2_PDU_RSP_FRMR: break; default: type = LLC_DEST_INVALID; break; } out: return type; } /** * llc_fixup_skb - initializes skb pointers * @skb: This argument points to incoming skb * * Initializes internal skb pointer to start of network layer by deriving * length of LLC header; finds length of LLC control field in LLC header * by looking at the two lowest-order bits of the first control field * byte; field is either 3 or 4 bytes long. */ static inline int llc_fixup_skb(struct sk_buff *skb) { u8 llc_len = 2; struct llc_pdu_un *pdu; if (unlikely(!pskb_may_pull(skb, sizeof(*pdu)))) return 0; pdu = (struct llc_pdu_un *)skb->data; if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) llc_len = 1; llc_len += 2; if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; skb_pull(skb, llc_len); skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; if (skb->mac_len < ETH_HLEN) return 0; pdulen = eth_hdr(skb)->h_proto; data_size = ntohs(pdulen) - llc_len; if (data_size < 0 || !pskb_may_pull(skb, data_size)) return 0; if (unlikely(pskb_trim_rcsum(skb, data_size))) return 0; } return 1; } /** * llc_rcv - 802.2 entry point from net lower layers * @skb: received pdu * @dev: device that receive pdu * @pt: packet type * @orig_dev: the original receive net device * * When the system receives a 802.2 frame this function is called. It * checks SAP and connection of received pdu and passes frame to * llc_{station,sap,conn}_rcv for sending to proper state machine. If * the frame is related to a busy connection (a connection is sending * data now), it queues this frame in the connection's backlog. */ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { dprintk("%s: PACKET_OTHERHOST\n", __func__); goto drop; } skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; if (unlikely(!llc_fixup_skb(skb))) goto drop; pdu = llc_pdu_sn_hdr(skb); if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ goto handle_station; sap = llc_sap_find(pdu->dsap); if (unlikely(!sap)) {/* unknown SAP */ dprintk("%s: llc_sap_find(%02X) failed!\n", __func__, pdu->dsap); goto drop; } /* * First the upper layer protocols that don't need the full * LLC functionality */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL; if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else kfree_skb(skb); } else { if (rcv) { struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); } sap_handler(sap, skb); } llc_sap_put(sap); out: return 0; drop: kfree_skb(skb); goto out; handle_station: sta_handler = READ_ONCE(llc_station_handler); if (!sta_handler) goto drop; sta_handler(skb); goto out; } EXPORT_SYMBOL(llc_add_pack); EXPORT_SYMBOL(llc_remove_pack); EXPORT_SYMBOL(llc_set_station_handler);
64 20 26 26 19 8 1 19 26 26 20 2 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memcpy_folio(struct folio *dst_folio, size_t dst_off, struct folio *src_folio, size_t src_off, size_t len) { VM_BUG_ON(dst_off + len > folio_size(dst_folio)); VM_BUG_ON(src_off + len > folio_size(src_folio)); do { char *dst = kmap_local_folio(dst_folio, dst_off); const char *src = kmap_local_folio(src_folio, src_off); size_t chunk = len; if (folio_test_highmem(dst_folio) && chunk > PAGE_SIZE - offset_in_page(dst_off)) chunk = PAGE_SIZE - offset_in_page(dst_off); if (folio_test_highmem(src_folio) && chunk > PAGE_SIZE - offset_in_page(src_off)) chunk = PAGE_SIZE - offset_in_page(src_off); memcpy(dst, src, chunk); kunmap_local(src); kunmap_local(dst); dst_off += chunk; src_off += chunk; len -= chunk; } while (len > 0); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_partial_kmap(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_partial_kmap(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_partial_kmap(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */
29 17 6 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/mm_types.h> #include <linux/ucopysize.h> #include <uapi/linux/uio.h> struct page; struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_UBUF, ITER_IOVEC, ITER_BVEC, ITER_KVEC, ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; #define ITER_SOURCE 1 // == WRITE #define ITER_DEST 0 // == READ struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool nofault; bool data_source; size_t iov_offset; /* * Hack alert: overlay ubuf_iovec with iovec + count, so * that the members resolve correctly regardless of the type * of iterator used. This means that you can use: * * &iter->__ubuf_iovec or iter->__iov * * interchangably for the user_backed cases, hence simplifying * some of the cases that need to deal with both. */ union { /* * This really should be a const, but we cannot do that without * also modifying any of the zero-filling iter init functions. * Leave it non-const for now, but it should be treated as such. */ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; u8 folioq_slot; loff_t xarray_start; }; }; typedef __u16 uio_meta_flags_t; struct uio_meta { uio_meta_flags_t flags; u16 app_tag; u64 seed; struct iov_iter iter; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) static inline size_t iter_iov_len(const struct iov_iter *i) { if (i->iter_type == ITER_UBUF) return i->count; return iter_iov(i)->iov_len - i->iov_offset; } static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_folioq(const struct iov_iter *i) { return iov_iter_type(i) == ITER_FOLIOQ; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. */ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif
287 285 284 285 287 287 287 287 287 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2005 SGI, Christoph Lameter * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov * Copyright (C) 2016 Intel, Matthew Wilcox * Copyright (C) 2016 Intel, Ross Zwisler */ #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/percpu.h> #include <linux/preempt.h> /* in_interrupt() */ #include <linux/radix-tree.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/xarray.h> #include "radix-tree.h" /* * Radix tree node cache. */ struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has * to build the branch to its corresponding item, it also has to build the * branch to existing items if the size has to be increased (by * radix_tree_extend). * * The worst case is a zero height tree with just a single item at index 0, * and then inserting an item at index ULONG_MAX. This requires 2 new branches * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared. * Hence: */ #define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1) /* * The IDR does not have to be as high as the radix tree since it uses * signed integers, not unsigned longs. */ #define IDR_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(int) - 1) #define IDR_MAX_PATH (DIV_ROUND_UP(IDR_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) #define IDR_PRELOAD_SIZE (IDR_MAX_PATH * 2 - 1) /* * Per-cpu pool of preloaded nodes */ DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { .lock = INIT_LOCAL_LOCK(lock), }; EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads); static inline struct radix_tree_node *entry_to_node(void *ptr) { return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); } static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } #define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) { return parent ? slot - parent->slots : 0; } static unsigned int radix_tree_descend(const struct radix_tree_node *parent, struct radix_tree_node **nodep, unsigned long index) { unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); *nodep = (void *)entry; return offset; } static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, int offset) { __set_bit(offset, node->tags[tag]); } static inline void tag_clear(struct radix_tree_node *node, unsigned int tag, int offset) { __clear_bit(offset, node->tags[tag]); } static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, int offset) { return test_bit(offset, node->tags[tag]); } static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { return !!(root->xa_flags & ROOT_IS_IDR); } /* * Returns 1 if any slot in the node has this tag set. * Otherwise returns 0. */ static inline int any_tag_set(const struct radix_tree_node *node, unsigned int tag) { unsigned idx; for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) { if (node->tags[tag][idx]) return 1; } return 0; } static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag) { bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE); } /** * radix_tree_find_next_bit - find the next set bit in a memory region * * @node: where to begin the search * @tag: the tag index * @offset: the bitnumber to start searching at * * Unrollable variant of find_next_bit() for constant size arrays. * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero. * Returns next bit offset, or size if nothing found. */ static __always_inline unsigned long radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, unsigned long offset) { const unsigned long *addr = node->tags[tag]; if (offset < RADIX_TREE_MAP_SIZE) { unsigned long tmp; addr += offset / BITS_PER_LONG; tmp = *addr >> (offset % BITS_PER_LONG); if (tmp) return __ffs(tmp) + offset; offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); while (offset < RADIX_TREE_MAP_SIZE) { tmp = *++addr; if (tmp) return __ffs(tmp) + offset; offset += BITS_PER_LONG; } } return RADIX_TREE_MAP_SIZE; } static unsigned int iter_offset(const struct radix_tree_iter *iter) { return iter->index & RADIX_TREE_MAP_MASK; } /* * The maximum index which can be stored in a radix tree */ static inline unsigned long shift_maxindex(unsigned int shift) { return (RADIX_TREE_MAP_SIZE << shift) - 1; } static inline unsigned long node_maxindex(const struct radix_tree_node *node) { return shift_maxindex(node->shift); } static unsigned long next_index(unsigned long index, const struct radix_tree_node *node, unsigned long offset) { return (index & ~node_maxindex(node)) + (offset << node->shift); } /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. */ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; /* * Preload code isn't irq safe and it doesn't make sense to use * preloading during an interrupt anyway as all the allocations have * to be atomic. So just do normal allocation when in interrupt. */ if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) { struct radix_tree_preload *rtp; /* * Even if the caller has preloaded, try to allocate from the * cache first for the new node to get accounted to the memory * cgroup. */ ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask | __GFP_NOWARN); if (ret) goto out; /* * Provided the caller has preloaded here, we will always * succeed in getting a node here (and never reach * kmem_cache_alloc) */ rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes; rtp->nodes = ret->parent; rtp->nr--; } /* * Update the allocation stack trace as this is more useful * for debugging. */ kmemleak_update_trace(ret); goto out; } ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); if (ret) { ret->shift = shift; ret->offset = offset; ret->count = count; ret->nr_values = nr_values; ret->parent = parent; ret->array = root; } return ret; } void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); /* * Must only free zeroed nodes into the slab. We can be left with * non-NULL entries by radix_tree_free_nodes, so clear the entries * and tags here. */ memset(node->slots, 0, sizeof(node->slots)); memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } static inline void radix_tree_node_free(struct radix_tree_node *node) { call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; int ret = -ENOMEM; /* * Nodes preloaded by one cgroup can be used by another cgroup, so * they should never be accounted to any particular memory cgroup. */ gfp_mask &= ~__GFP_ACCOUNT; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { local_unlock(&radix_tree_preloads.lock); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; local_lock(&radix_tree_preloads.lock); rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < nr) { node->parent = rtp->nodes; rtp->nodes = node; rtp->nr++; } else { kmem_cache_free(radix_tree_node_cachep, node); } } ret = 0; out: return ret; } /* * Load up this CPU's radix_tree_node buffer with sufficient objects to * ensure that the addition of a single element in the tree cannot fail. On * success, return zero, with preemption disabled. On error, return -ENOMEM * with preemption not disabled. * * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ int radix_tree_preload(gfp_t gfp_mask) { /* Warn on non-sensical use... */ WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); } EXPORT_SYMBOL(radix_tree_preload); /* * The same as above function, except we don't guarantee preloading happens. * We do it, if we decide it helps. On success, return zero with preemption * disabled. On error, return -ENOMEM with preemption not disabled. */ int radix_tree_maybe_preload(gfp_t gfp_mask) { if (gfpflags_allow_blocking(gfp_mask)) return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); /* Preloading doesn't help anything with this gfp mask, skip it */ local_lock(&radix_tree_preloads.lock); return 0; } EXPORT_SYMBOL(radix_tree_maybe_preload); static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; if (likely(radix_tree_is_internal_node(node))) { node = entry_to_node(node); *maxindex = node_maxindex(node); return node->shift + RADIX_TREE_MAP_SHIFT; } *maxindex = 0; return 0; } /* * Extend a radix tree so it can store key @index. */ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, unsigned long index, unsigned int shift) { void *entry; unsigned int maxshift; int tag; /* Figure out what the shift should be. */ maxshift = shift; while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; do { struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL, root, shift, 0, 1, 0); if (!node) return -ENOMEM; if (is_idr(root)) { all_tag_set(node, IDR_FREE); if (!root_tag_get(root, IDR_FREE)) { tag_clear(node, IDR_FREE, 0); root_tag_set(root, IDR_FREE); } } else { /* Propagate the aggregated tag info to the new child */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); } } BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { /* Moving a value entry root->xa_head to a node */ node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need * rcu_assign_pointer here */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: return maxshift + RADIX_TREE_MAP_SHIFT; } /** * radix_tree_shrink - shrink radix tree to minimum height * @root: radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; for (;;) { struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) break; node = entry_to_node(node); /* * The candidate node has more than one child, or its child * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; /* * For an IDR, we must not shrink entry 0 into the root in * case somebody calls idr_replace() with a pointer that * appears to be an internal entry */ if (!node->shift && is_idr(root)) break; if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; /* * We don't need rcu_assign_pointer(), since we are simply * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new * one (root->xa_head) as far as dependent read barriers go. */ root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); /* * We have a dilemma here. The node's slot[0] must not be * NULLed in case there are concurrent lookups expecting to * find the item. However if this was a bottom-level node, * then it may be subject to the slot pointer being visible * to callers dereferencing it. If item corresponding to * slot[0] is subsequently deleted, these callers would expect * their slot to become empty sooner or later. * * For example, lockless pagecache will look up a slot, deref * the page pointer, and if the page has 0 refcount it means it * was concurrently deleted from pagecache so try the deref * again. Fortunately there is already a requirement for logic * to retry the entire slot lookup -- the indirect pointer * problem (replacing direct root node with an indirect pointer * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); shrunk = true; } return shrunk; } static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { bool deleted = false; do { struct radix_tree_node *parent; if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) deleted |= radix_tree_shrink(root); return deleted; } parent = node->parent; if (parent) { parent->slots[node->offset] = NULL; parent->count--; } else { /* * Shouldn't the tags already have all been cleared * by the caller? */ if (!is_idr(root)) root_tag_clear_all(root); root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); radix_tree_node_free(node); deleted = true; node = parent; } while (node); return deleted; } /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Create, if necessary, and return the node and slot for an item * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) return error; shift = error; child = rcu_dereference_raw(root->xa_head); } while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return -ENOMEM; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; /* Go a level down */ node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); slot = &node->slots[offset]; } if (nodep) *nodep = node; if (slotp) *slotp = slot; return 0; } /* * Free any nodes below this node. The tree is presumed to not need * shrinking, and any user data in the tree is presumed to not need a * destructor called on it. If we need to add a destructor, we can * add that functionality later. Note that we may not clear tags or * slots from the tree as an RCU walker may still have a pointer into * this subtree. We could replace the entries with RADIX_TREE_RETRY, * but we'll still have to clear those in rcu_free. */ static void radix_tree_free_nodes(struct radix_tree_node *node) { unsigned offset = 0; struct radix_tree_node *child = entry_to_node(node); for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; } offset++; while (offset == RADIX_TREE_MAP_SIZE) { struct radix_tree_node *old = child; offset = child->offset + 1; child = child->parent; WARN_ON_ONCE(!list_empty(&old->private_list)); radix_tree_node_free(old); if (old == entry_to_node(node)) return; } } } static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item) { if (*slot) return -EEXIST; rcu_assign_pointer(*slot, item); if (node) { node->count++; if (xa_is_value(item)) node->nr_values++; } return 1; } /** * radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key * @item: item to insert * * Insert an item into the radix tree at position @index. */ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; void __rcu **slot; int error; BUG_ON(radix_tree_is_internal_node(item)); error = __radix_tree_create(root, index, &node, &slot); if (error) return error; error = insert_entries(node, slot, item); if (error < 0) return error; if (node) { unsigned offset = get_slot_offset(node, slot); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); } else { BUG_ON(root_tags_get(root)); } return 0; } EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree * @root: radix tree root * @index: index key * @nodep: returns node * @slotp: returns slot * * Lookup and return the item at position @index in the radix * tree @root. * * Until there is more than one item in the tree, no nodes are * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node, *parent; unsigned long maxindex; void __rcu **slot; restart: parent = NULL; slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; if (node == RADIX_TREE_RETRY) goto restart; if (parent->shift == 0) break; } if (nodep) *nodep = parent; if (slotp) *slotp = slot; return node; } /** * radix_tree_lookup_slot - lookup a slot in a radix tree * @root: radix tree root * @index: index key * * Returns: the slot corresponding to the position @index in the * radix tree @root. This is useful for update-if-exists operations. * * This function can be called under rcu_read_lock iff the slot is not * modified by radix_tree_replace_slot, otherwise it must be called * exclusive from other writers. Any dereference of the slot must be done * using radix_tree_deref_slot. */ void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root, unsigned long index) { void __rcu **slot; if (!__radix_tree_lookup(root, index, NULL, &slot)) return NULL; return slot; } EXPORT_SYMBOL(radix_tree_lookup_slot); /** * radix_tree_lookup - perform lookup operation on a radix tree * @root: radix tree root * @index: index key * * Lookup the item at the position @index in the radix tree @root. * * This function can be called under rcu_read_lock, however the caller * must manage lifetimes of leaf nodes (eg. RCU may also be used to free * them safely). No RCU barriers are required to access or modify the * returned item, however. */ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) { return __radix_tree_lookup(root, index, NULL, NULL); } EXPORT_SYMBOL(radix_tree_lookup); static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; } rcu_assign_pointer(*slot, item); } static bool node_tag_get(const struct radix_tree_root *root, const struct radix_tree_node *node, unsigned int tag, unsigned int offset) { if (node) return tag_get(node, tag, offset); return root_tag_get(root, tag); } /* * IDR users want to be able to store NULL in the tree, so if the slot isn't * free, don't adjust the count, even if it's transitioning between NULL and * non-NULL. For the IDA, we mark slots as being IDR_FREE while they still * have empty bits, but it only stores NULL in slots when they're being * deleted. */ static int calculate_count(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, void *old) { if (is_idr(root)) { unsigned offset = get_slot_offset(node, slot); bool free = node_tag_get(root, node, IDR_FREE, offset); if (!free) return 0; if (!old) return 1; } return !!item - !!old; } /** * __radix_tree_replace - replace item in a slot * @root: radix tree root * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || values)); replace_slot(slot, item, node, count, values); if (!node) return; delete_node(root, node); } /** * radix_tree_replace_slot - replace item in a slot * @root: radix tree root * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * @item: new item to store in the slot. * * For use with radix_tree_for_each_slot(). * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { __radix_tree_replace(root, iter->node, slot, item); } static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (tag_get(node, tag, offset)) return; tag_set(node, tag, offset); offset = node->offset; node = node->parent; } if (!root_tag_get(root, tag)) root_tag_set(root, tag); } /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Set the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. From * the root all the way down to the leaf node. * * Returns the address of the tagged item. Setting a tag on a not-present * item is a bug. */ void *radix_tree_tag_set(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; radix_tree_load_root(root, &node, &maxindex); BUG_ON(index > maxindex); while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); BUG_ON(!node); if (!tag_get(parent, tag, offset)) tag_set(parent, tag, offset); } /* set the root's tag bit */ if (!root_tag_get(root, tag)) root_tag_set(root, tag); return node; } EXPORT_SYMBOL(radix_tree_tag_set); static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) { while (node) { if (!tag_get(node, tag, offset)) return; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) return; offset = node->offset; node = node->parent; } /* clear the root's tag bit */ if (root_tag_get(root, tag)) root_tag_clear(root, tag); } /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index * * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS) * corresponding to @index in the radix tree. If this causes * the leaf node to have no tags set then clear the tag in the * next-to-leaf node, etc. * * Returns the address of the tagged item on success, else NULL. ie: * has the same return value and semantics as radix_tree_lookup(). */ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; int offset = 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; parent = NULL; while (radix_tree_is_internal_node(node)) { parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); } if (node) node_tag_clear(root, parent, tag, offset); return node; } EXPORT_SYMBOL(radix_tree_tag_clear); /** * radix_tree_iter_tag_clear - clear a tag on the current iterator entry * @root: radix tree root * @iter: iterator state * @tag: tag to clear */ void radix_tree_iter_tag_clear(struct radix_tree_root *root, const struct radix_tree_iter *iter, unsigned int tag) { node_tag_clear(root, iter->node, tag, iter_offset(iter)); } /** * radix_tree_tag_get - get a tag on a radix tree node * @root: radix tree root * @index: index key * @tag: tag index (< RADIX_TREE_MAX_TAGS) * * Return values: * * 0: tag not present or not set * 1: tag set * * Note that the return value of this function may not be relied on, even if * the RCU lock is held, unless tag modification and node deletion are excluded * from concurrency. */ int radix_tree_tag_get(const struct radix_tree_root *root, unsigned long index, unsigned int tag) { struct radix_tree_node *node, *parent; unsigned long maxindex; if (!root_tag_get(root, tag)) return 0; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return 0; while (radix_tree_is_internal_node(node)) { unsigned offset; parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); if (!tag_get(parent, tag, offset)) return 0; if (node == RADIX_TREE_RETRY) break; } return 1; } EXPORT_SYMBOL(radix_tree_tag_get); /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, unsigned tag) { unsigned tag_long = offset / BITS_PER_LONG; unsigned tag_bit = offset % BITS_PER_LONG; if (!node) { iter->tags = 1; return; } iter->tags = node->tags[tag][tag_long] >> tag_bit; /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ if (tag_long < RADIX_TREE_TAG_LONGS - 1) { /* Pick tags from next element */ if (tag_bit) iter->tags |= node->tags[tag][tag_long + 1] << (BITS_PER_LONG - tag_bit); /* Clip chunk size, here only BITS_PER_LONG tags */ iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); } } void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { iter->index = __radix_tree_iter_add(iter, 1); iter->next_index = iter->index; iter->tags = 0; return NULL; } EXPORT_SYMBOL(radix_tree_iter_resume); /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if iteration is over */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; struct radix_tree_node *node, *child; unsigned long index, offset, maxindex; if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag)) return NULL; /* * Catch next_index overflow after ~0UL. iter->index never overflows * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. * * This condition also used by radix_tree_next_slot() to stop * contiguous iterating, and forbid switching to the next chunk. */ index = iter->next_index; if (!index && iter->index) return NULL; restart: radix_tree_load_root(root, &child, &maxindex); if (index > maxindex) return NULL; if (!child) return NULL; if (!radix_tree_is_internal_node(child)) { /* Single-slot tree */ iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; return (void __rcu **)&root->xa_head; } do { node = entry_to_node(child); offset = radix_tree_descend(node, &child, index); if ((flags & RADIX_TREE_ITER_TAGGED) ? !tag_get(node, tag, offset) : !child) { /* Hole detected */ if (flags & RADIX_TREE_ITER_CONTIG) return NULL; if (flags & RADIX_TREE_ITER_TAGGED) offset = radix_tree_find_next_bit(node, tag, offset + 1); else while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); if (slot) break; } index &= ~node_maxindex(node); index += offset << node->shift; /* Overflow after ~0UL */ if (!index) return NULL; if (offset == RADIX_TREE_MAP_SIZE) goto restart; child = rcu_dereference_raw(node->slots[offset]); } if (!child) goto restart; if (child == RADIX_TREE_RETRY) break; } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); return node->slots + offset; } EXPORT_SYMBOL(radix_tree_next_chunk); /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * * Performs an index-ascending scan of the tree for present items. Places * them at *@results and returns the number of items which were placed at * *@results. * * The implementation is naive. * * Like radix_tree_lookup, radix_tree_gang_lookup may be called under * rcu_read_lock. In this case, rather than the returned results being * an atomic snapshot of the tree at a single point in time, the * semantics of an RCU protected gang lookup are as though multiple * radix_tree_lookups have been issued in individual locks, and results * stored in 'results'. */ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_slot(slot, root, &iter, first_index) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the items at *@results and * returns the number of items which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = rcu_dereference_raw(*slot); if (!results[ret]) continue; if (radix_tree_is_internal_node(results[ret])) { slot = radix_tree_iter_retry(&iter); continue; } if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag); /** * radix_tree_gang_lookup_tag_slot - perform multiple slot lookup on a * radix tree based on a tag * @root: radix tree root * @results: where the results of the lookup are placed * @first_index: start the lookup from this key * @max_items: place up to this many items at *results * @tag: the tag index (< RADIX_TREE_MAX_TAGS) * * Performs an index-ascending scan of the tree for present items which * have the tag indexed by @tag set. Places the slots at *@results and * returns the number of slots which were placed at *@results. */ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag) { struct radix_tree_iter iter; void __rcu **slot; unsigned int ret = 0; if (unlikely(!max_items)) return 0; radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) { results[ret] = slot; if (++ret == max_items) break; } return ret; } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; if (is_idr(root)) node_tag_set(root, node, IDR_FREE, offset); else for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node); } /** * radix_tree_iter_delete - delete the entry at this iterator position * @root: radix tree root * @iter: iterator state * @slot: pointer to slot * * Delete the entry at the position currently pointed to by the iterator. * This may result in the current node being freed; if it is, the iterator * is advanced so that it will not reference the freed memory. This * function may be called without any locking if there are no other threads * which can access this tree. */ void radix_tree_iter_delete(struct radix_tree_root *root, struct radix_tree_iter *iter, void __rcu **slot) { if (__radix_tree_delete(root, iter->node, slot)) iter->index = iter->next_index; } EXPORT_SYMBOL(radix_tree_iter_delete); /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root * @index: index key * @item: expected item * * Remove @item at @index from the radix tree rooted at @root. * * Return: the deleted entry, or %NULL if it was not present * or the entry at the given @index was not @item. */ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node = NULL; void __rcu **slot = NULL; void *entry; entry = __radix_tree_lookup(root, index, &node, &slot); if (!slot) return NULL; if (!entry && (!is_idr(root) || node_tag_get(root, node, IDR_FREE, get_slot_offset(node, slot)))) return NULL; if (item && entry != item) return NULL; __radix_tree_delete(root, node, slot); return entry; } EXPORT_SYMBOL(radix_tree_delete_item); /** * radix_tree_delete - delete an entry from a radix tree * @root: radix tree root * @index: index key * * Remove the entry at @index from the radix tree rooted at @root. * * Return: The deleted entry, or %NULL if it was not present. */ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) { return radix_tree_delete_item(root, index, NULL); } EXPORT_SYMBOL(radix_tree_delete); /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root * @tag: tag to test */ int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag) { return root_tag_get(root, tag); } EXPORT_SYMBOL(radix_tree_tagged); /** * idr_preload - preload for idr_alloc() * @gfp_mask: allocation mask to use for preloading * * Preallocate memory to use for the next call to idr_alloc(). This function * returns with preemption disabled. It will be enabled by idr_preload_end(). */ void idr_preload(gfp_t gfp_mask) { if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) local_lock(&radix_tree_preloads.lock); } EXPORT_SYMBOL(idr_preload); void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; grow: shift = radix_tree_load_root(root, &child, &maxindex); if (!radix_tree_tagged(root, IDR_FREE)) start = max(start, maxindex + 1); if (start > max) return ERR_PTR(-ENOSPC); if (start > maxindex) { int error = radix_tree_extend(root, gfp, start, shift); if (error < 0) return ERR_PTR(error); shift = error; child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ child = radix_tree_node_alloc(gfp, node, root, shift, offset, 0, 0); if (!child) return ERR_PTR(-ENOMEM); all_tag_set(child, IDR_FREE); rcu_assign_pointer(*slot, node_to_entry(child)); if (node) node->count++; } else if (!radix_tree_is_internal_node(child)) break; node = entry_to_node(child); offset = radix_tree_descend(node, &child, start); if (!tag_get(node, IDR_FREE, offset)) { offset = radix_tree_find_next_bit(node, IDR_FREE, offset + 1); start = next_index(start, node, offset); if (start > max || start == 0) return ERR_PTR(-ENOSPC); while (offset == RADIX_TREE_MAP_SIZE) { offset = node->offset + 1; node = node->parent; if (!node) goto grow; shift = node->shift; } child = rcu_dereference_raw(node->slots[offset]); } slot = &node->slots[offset]; } iter->index = start; if (node) iter->next_index = 1 + min(max, (start | node_maxindex(node))); else iter->next_index = 1; iter->node = node; set_iter_tags(iter, node, offset, IDR_FREE); return slot; } /** * idr_destroy - release all internal memory from an IDR * @idr: idr handle * * After this function is called, the IDR is empty, and may be reused or * the data structure containing it may be freed. * * A typical clean-up sequence for objects stored in an idr tree will use * idr_for_each() to free all objects, if necessary, then idr_destroy() to * free the memory used to keep track of those objects. */ void idr_destroy(struct idr *idr) { struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); static void radix_tree_node_ctor(void *arg) { struct radix_tree_node *node = arg; memset(node, 0, sizeof(*node)); INIT_LIST_HEAD(&node->private_list); } static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; struct radix_tree_node *node; /* Free per-cpu pool of preloaded nodes */ rtp = &per_cpu(radix_tree_preloads, cpu); while (rtp->nr) { node = rtp->nodes; rtp->nodes = node->parent; kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } return 0; } void __init radix_tree_init(void) { int ret; BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); }
30 14 10 8 8 8 30 30 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 // SPDX-License-Identifier: GPL-2.0-only /* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> #include <linux/security.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, const struct in6_addr *addr, __be16 port, const char *name1, const char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, const char *name1, const char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * audit_log_lsm_data - helper to log common LSM audit data * @ab : the audit buffer * @a : common audit data */ void audit_log_lsm_data(struct audit_buffer *ab, const struct common_audit_data *a) { /* * To keep stack sizes in check force programmers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " ipc_key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; audit_log_format(ab, " name="); spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; rcu_read_lock(); inode = a->u.inode; dentry = d_find_alias_rcu(inode); if (dentry) { audit_log_format(ab, " name="); spin_lock(&dentry->d_lock); audit_log_untrustedstring(ab, dentry->d_name.name); spin_unlock(&dentry->d_lock); } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); rcu_read_unlock(); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char tskcomm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, get_task_comm(tskcomm, tsk)); } } break; } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { const struct sock *sk = a->u.net->sk; const struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; switch (sk->sk_family) { case AF_INET: { const struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { const struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; case LSM_AUDIT_DATA_LOCKDOWN: audit_log_format(ab, " lockdown_reason=\"%s\"", lockdown_reasons[a->u.reason]); break; case LSM_AUDIT_DATA_ANONINODE: audit_log_format(ab, " anonclass=%s", a->u.anonclass); break; case LSM_AUDIT_DATA_NLMSGTYPE: audit_log_format(ab, " nl-msgtype=%hu", a->u.nlmsg_type); break; } /* switch (a->type) */ } /** * dump_common_audit_data - helper to dump common audit data * @ab : the audit buffer * @a : common audit data */ static void dump_common_audit_data(struct audit_buffer *ab, const struct common_audit_data *a) { char comm[sizeof(current->comm)]; audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, get_task_comm(comm, current)); audit_log_lsm_data(ab, a); } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ ab = audit_log_start(audit_context(), GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; if (pre_audit) pre_audit(ab, a); dump_common_audit_data(ab, a); if (post_audit) post_audit(ab, a); audit_log_end(ab); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: Internet Group Management Protocol [IGMP] * * This code implements the IGMP protocol as defined in RFC1112. There has * been a further revision of this protocol since which is now supported. * * If you have trouble with this module be careful what gcc you have used, * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Fixes: * * Alan Cox : Added lots of __inline__ to optimise * the memory usage of all the tiny little * functions. * Alan Cox : Dumped the header building experiment. * Alan Cox : Minor tweaks ready for multicast routing * and extended IGMP protocol. * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 * writes utterly bogus code otherwise (sigh) * fixed IGMP loopback to behave in the manner * desired by mrouted, fixed the fact it has been * broken since 1.3.6 and cleaned up a few minor * points. * * Chih-Jen Chang : Tried to revise IGMP to Version 2 * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu * The enhancements are mainly based on Steve Deering's * ipmulti-3.5 source code. * Chih-Jen Chang : Added the igmp_get_mrouter_info and * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of * the mrouted version on that device. * Chih-Jen Chang : Added the max_resp_time parameter to * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter * to identify the multicast router version * and do what the IGMP version 2 specified. * Chih-Jen Chang : Added a timer to revert to IGMP V2 router * Tsu-Sheng Tsao if the specified time expired. * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. * Alan Cox : Use GFP_ATOMIC in the right places. * Christian Daudt : igmp timer wasn't set for local group * memberships but was being deleted, * which caused a "del_timer() called * from %p with timer not initialized\n" * message (960131). * Christian Daudt : removed del_timer from * igmp_timer_expire function (960205). * Christian Daudt : igmp_heard_report now only calls * igmp_timer_expire if tm->running is * true (960216). * Malcolm Beattie : ttl comparison wrong in igmp_rcv made * igmp_heard_query never trigger. Expiry * miscalculation fixed in igmp_heard_query * and random() made to return unsigned to * prevent negative expiry times. * Alexey Kuznetsov: Wrong group leaving behaviour, backport * fix from pending 2.1.x patches. * Alan Cox: Forget to enable FDDI support earlier. * Alexey Kuznetsov: Fixed leaving groups on device down. * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. * David L Stevens: IGMPv3 support, with help from * Vinay Kulkarni */ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> #include <linux/pkt_sched.h> #include <linux/byteorder/generic.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> #include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #ifdef CONFIG_PROC_FS #include <linux/proc_fs.h> #include <linux/seq_file.h> #endif #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ #define IGMP_QUERY_INTERVAL (125*HZ) #define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ) #define IGMP_INITIAL_REPORT_DELAY (1) /* IGMP_INITIAL_REPORT_DELAY is not from IGMP specs! * IGMP specs require to report membership immediately after * joining a group, but we delay the first report by a * small interval. It seems more natural and still does not * contradict to specs provided this delay is small enough. */ #define IGMP_V1_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ ((in_dev)->mr_v1_seen && \ time_before(jiffies, (in_dev)->mr_v1_seen))) #define IGMP_V2_SEEN(in_dev) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ ((in_dev)->mr_v2_seen && \ time_before(jiffies, (in_dev)->mr_v2_seen))) static int unsolicited_report_interval(struct in_device *in_dev) { int interval_ms, interval_jiffies; if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV2_UNSOLICITED_REPORT_INTERVAL); else /* v3 */ interval_ms = IN_DEV_CONF_GET( in_dev, IGMPV3_UNSOLICITED_REPORT_INTERVAL); interval_jiffies = msecs_to_jiffies(interval_ms); /* _timer functions can't handle a delay of 0 jiffies so ensure * we always return a positive value. */ if (interval_jiffies <= 0) interval_jiffies = 1; return interval_jiffies; } static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp); static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im); static void igmpv3_clear_delrec(struct in_device *in_dev); static int sf_setstate(struct ip_mc_list *pmc); static void sf_markstate(struct ip_mc_list *pmc); #endif static void ip_mc_clear_src(struct ip_mc_list *pmc); static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta); static void ip_ma_put(struct ip_mc_list *im) { if (refcount_dec_and_test(&im->refcnt)) { in_dev_put(im->interface); kfree_rcu(im, rcu); } } #define for_each_pmc_rcu(in_dev, pmc) \ for (pmc = rcu_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rcu_dereference(pmc->next_rcu)) #define for_each_pmc_rtnl(in_dev, pmc) \ for (pmc = rtnl_dereference(in_dev->mc_list); \ pmc != NULL; \ pmc = rtnl_dereference(pmc->next_rcu)) static void ip_sf_list_clear_all(struct ip_sf_list *psf) { struct ip_sf_list *next; while (psf) { next = psf->sf_next; kfree(psf); psf = next; } } #ifdef CONFIG_IP_MULTICAST /* * Timer management */ static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); if (timer_delete(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; im->unsolicit_count = 0; spin_unlock_bh(&im->lock); } /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { int tv = get_random_u32_below(max_delay); im->tm_running = 1; if (refcount_inc_not_zero(&im->refcnt)) { if (mod_timer(&im->timer, jiffies + tv + 2)) ip_ma_put(im); } } static void igmp_gq_start_timer(struct in_device *in_dev) { int tv = get_random_u32_below(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && time_after_eq(exp, (in_dev->mr_gq_timer).expires)) return; in_dev->mr_gq_running = 1; if (!mod_timer(&in_dev->mr_gq_timer, exp)) in_dev_hold(in_dev); } static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { int tv = get_random_u32_below(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); } static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; if (timer_delete(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; spin_unlock_bh(&im->lock); return; } refcount_dec(&im->refcnt); } igmp_start_timer(im, max_delay); spin_unlock_bh(&im->lock); } /* * Send an IGMP report. */ #define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, int gdeleted, int sdeleted) { switch (type) { case IGMPV3_MODE_IS_INCLUDE: case IGMPV3_MODE_IS_EXCLUDE: if (gdeleted || sdeleted) return 0; if (!(pmc->gsquery && !psf->sf_gsresp)) { if (pmc->sfmode == MCAST_INCLUDE) return 1; /* don't include if this source is excluded * in all filters */ if (psf->sf_count[MCAST_INCLUDE]) return type == IGMPV3_MODE_IS_INCLUDE; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; } return 0; case IGMPV3_CHANGE_TO_INCLUDE: if (gdeleted || sdeleted) return 0; return psf->sf_count[MCAST_INCLUDE] != 0; case IGMPV3_CHANGE_TO_EXCLUDE: if (gdeleted || sdeleted) return 0; if (pmc->sfcount[MCAST_EXCLUDE] == 0 || psf->sf_count[MCAST_INCLUDE]) return 0; return pmc->sfcount[MCAST_EXCLUDE] == psf->sf_count[MCAST_EXCLUDE]; case IGMPV3_ALLOW_NEW_SOURCES: if (gdeleted || !psf->sf_crcount) return 0; return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; case IGMPV3_BLOCK_OLD_SOURCES: if (pmc->sfmode == MCAST_INCLUDE) return gdeleted || (psf->sf_crcount && sdeleted); return psf->sf_crcount && !gdeleted && !sdeleted; } return 0; } static int igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct ip_sf_list *psf; int scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (!is_in(pmc, psf, type, gdeleted, sdeleted)) continue; scount++; } return scount; } /* source address selection per RFC 3376 section 4.2.13 */ static __be32 igmpv3_get_srcaddr(struct net_device *dev, const struct flowi4 *fl4) { struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) return htonl(INADDR_ANY); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (fl4->saddr == ifa->ifa_local) return fl4->saddr; } return htonl(INADDR_ANY); } static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) { struct sk_buff *skb; struct rtable *rt; struct iphdr *pip; struct igmpv3_report *pig; struct net *net = dev_net(dev); struct flowi4 fl4; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int size; size = min(mtu, IP_MAX_MTU); while (1) { skb = alloc_skb(size + hlen + tlen, GFP_ATOMIC | __GFP_NOWARN); if (skb) break; size >>= 1; if (size < 256) return NULL; } skb->priority = TC_PRIO_CONTROL; rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) { kfree_skb(skb); return NULL; } skb_dst_set(skb, &rt->dst); skb->dev = dev; skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); skb_reset_network_header(skb); pip = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); pip->version = 4; pip->ihl = (sizeof(struct iphdr)+4)>>2; pip->tos = 0xc0; pip->frag_off = htons(IP_DF); pip->ttl = 1; pip->daddr = fl4.daddr; rcu_read_lock(); pip->saddr = igmpv3_get_srcaddr(dev, &fl4); rcu_read_unlock(); pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; ((u8 *)&pip[1])[3] = 0; skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4; skb_put(skb, sizeof(*pig)); pig = igmpv3_report_hdr(skb); pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; pig->resv1 = 0; pig->csum = 0; pig->resv2 = 0; pig->ngrec = 0; return skb; } static int igmpv3_sendpack(struct sk_buff *skb) { struct igmphdr *pig = igmp_hdr(skb); const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb); pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) { return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel); } static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, int type, struct igmpv3_grec **ppgr, unsigned int mtu) { struct net_device *dev = pmc->interface->dev; struct igmpv3_report *pih; struct igmpv3_grec *pgr; if (!skb) { skb = igmpv3_newpack(dev, mtu); if (!skb) return NULL; } pgr = skb_put(skb, sizeof(struct igmpv3_grec)); pgr->grec_type = type; pgr->grec_auxwords = 0; pgr->grec_nsrcs = 0; pgr->grec_mca = pmc->multiaddr; pih = igmpv3_report_hdr(skb); pih->ngrec = htons(ntohs(pih->ngrec)+1); *ppgr = pgr; return skb; } #define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0) static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; int scount, stotal, first, isquery, truncate; unsigned int mtu; if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return skb; mtu = READ_ONCE(dev->mtu); if (mtu < IPV4_MIN_MTU) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; truncate = type == IGMPV3_MODE_IS_EXCLUDE || type == IGMPV3_CHANGE_TO_EXCLUDE; stotal = scount = 0; psf_list = sdeleted ? &pmc->tomb : &pmc->sources; if (!*psf_list) goto empty_source; pih = skb ? igmpv3_report_hdr(skb) : NULL; /* EX and TO_EX get a fresh packet, if needed */ if (truncate) { if (pih && pih->ngrec && AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); } } first = 1; psf_prev = NULL; for (psf = *psf_list; psf; psf = psf_next) { __be32 *psrc; psf_next = psf->sf_next; if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { psf_prev = psf; continue; } /* Based on RFC3376 5.1. Should not send source-list change * records when there is a filter mode change. */ if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) || (!gdeleted && pmc->crcount)) && (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) goto decrease_sf_crcount; /* clear marks on query responses */ if (isquery) psf->sf_gsresp = 0; if (AVAILABLE(skb) < sizeof(__be32) + first*sizeof(struct igmpv3_grec)) { if (truncate && !first) break; /* truncate these */ if (pgr) pgr->grec_nsrcs = htons(scount); if (skb) igmpv3_sendpack(skb); skb = igmpv3_newpack(dev, mtu); first = 1; scount = 0; } if (first) { skb = add_grhead(skb, pmc, type, &pgr, mtu); first = 0; } if (!skb) return NULL; psrc = skb_put(skb, sizeof(__be32)); *psrc = psf->sf_inaddr; scount++; stotal++; if ((type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { decrease_sf_crcount: psf->sf_crcount--; if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *psf_list = psf->sf_next; kfree(psf); continue; } } psf_prev = psf; } empty_source: if (!stotal) { if (type == IGMPV3_ALLOW_NEW_SOURCES || type == IGMPV3_BLOCK_OLD_SOURCES) return skb; if (pmc->crcount || isquery) { /* make sure we have room for group header */ if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) { igmpv3_sendpack(skb); skb = NULL; /* add_grhead will get a new one */ } skb = add_grhead(skb, pmc, type, &pgr, mtu); } } if (pgr) pgr->grec_nsrcs = htons(scount); if (isquery) pmc->gsquery = 0; /* clear query state on report */ return skb; } static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); } else { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; else type = IGMPV3_MODE_IS_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); spin_unlock_bh(&pmc->lock); } if (!skb) return 0; return igmpv3_sendpack(skb); } /* * remove zero-count source records from a source filter list */ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) { struct ip_sf_list *psf_prev, *psf_next, *psf; psf_prev = NULL; for (psf = *ppsf; psf; psf = psf_next) { psf_next = psf->sf_next; if (psf->sf_crcount == 0) { if (psf_prev) psf_prev->sf_next = psf->sf_next; else *ppsf = psf->sf_next; kfree(psf); } else psf_prev = psf; } } static void kfree_pmc(struct ip_mc_list *pmc) { ip_sf_list_clear_all(pmc->sources); ip_sf_list_clear_all(pmc->tomb); kfree(pmc); } static void igmpv3_send_cr(struct in_device *in_dev) { struct ip_mc_list *pmc, *pmc_prev, *pmc_next; struct sk_buff *skb = NULL; int type, dtype; rcu_read_lock(); spin_lock_bh(&in_dev->mc_tomb_lock); /* deleted MCA's */ pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) { pmc_next = pmc->next; if (pmc->sfmode == MCAST_INCLUDE) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; skb = add_grec(skb, pmc, type, 1, 0); skb = add_grec(skb, pmc, dtype, 1, 1); } if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) { type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 1, 0); } pmc->crcount--; if (pmc->crcount == 0) { igmpv3_clear_zeros(&pmc->tomb); igmpv3_clear_zeros(&pmc->sources); } } if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { if (pmc_prev) pmc_prev->next = pmc_next; else in_dev->mc_tomb = pmc_next; in_dev_put(pmc->interface); kfree_pmc(pmc); } else pmc_prev = pmc; } spin_unlock_bh(&in_dev->mc_tomb_lock); /* change recs */ for_each_pmc_rcu(in_dev, pmc) { spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) { type = IGMPV3_BLOCK_OLD_SOURCES; dtype = IGMPV3_ALLOW_NEW_SOURCES; } else { type = IGMPV3_ALLOW_NEW_SOURCES; dtype = IGMPV3_BLOCK_OLD_SOURCES; } skb = add_grec(skb, pmc, type, 0, 0); skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ /* filter mode changes */ if (pmc->crcount) { if (pmc->sfmode == MCAST_EXCLUDE) type = IGMPV3_CHANGE_TO_EXCLUDE; else type = IGMPV3_CHANGE_TO_INCLUDE; skb = add_grec(skb, pmc, type, 0, 0); pmc->crcount--; } spin_unlock_bh(&pmc->lock); } rcu_read_unlock(); if (!skb) return; (void) igmpv3_sendpack(skb); } static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, int type) { struct sk_buff *skb; struct iphdr *iph; struct igmphdr *ih; struct rtable *rt; struct net_device *dev = in_dev->dev; struct net *net = dev_net(dev); __be32 group = pmc ? pmc->multiaddr : 0; struct flowi4 fl4; __be32 dst; int hlen, tlen; if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; rt = ip_route_output_ports(net, &fl4, NULL, dst, 0, 0, 0, IPPROTO_IGMP, 0, dev->ifindex); if (IS_ERR(rt)) return -1; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC); if (!skb) { ip_rt_put(rt); return -1; } skb->priority = TC_PRIO_CONTROL; skb_dst_set(skb, &rt->dst); skb_reserve(skb, hlen); skb_reset_network_header(skb); iph = ip_hdr(skb); skb_put(skb, sizeof(struct iphdr) + 4); iph->version = 4; iph->ihl = (sizeof(struct iphdr)+4)>>2; iph->tos = 0xc0; iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; ((u8 *)&iph[1])[3] = 0; ih = skb_put(skb, sizeof(struct igmphdr)); ih->type = type; ih->code = 0; ih->csum = 0; ih->group = group; ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr)); return ip_local_out(net, skb->sk, skb); } static void igmp_gq_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer); in_dev->mr_gq_running = 0; igmpv3_send_report(in_dev, NULL); in_dev_put(in_dev); } static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer); u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); if (mr_ifc_count) { if (cmpxchg(&in_dev->mr_ifc_count, mr_ifc_count, mr_ifc_count - 1) != mr_ifc_count) goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } in_dev_put(in_dev); } static void igmp_ifc_event(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv)); igmp_ifc_start_timer(in_dev, 1); } static void igmp_timer_expire(struct timer_list *t) { struct ip_mc_list *im = timer_container_of(im, t, timer); struct in_device *in_dev = im->interface; spin_lock(&im->lock); im->tm_running = 0; if (im->unsolicit_count && --im->unsolicit_count) igmp_start_timer(im, unsolicited_report_interval(in_dev)); im->reporter = 1; spin_unlock(&im->lock); if (IGMP_V1_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); else if (IGMP_V2_SEEN(in_dev)) igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); else igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); ip_ma_put(im); } /* mark EXCLUDE-mode sources */ static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) { /* skip inactive filters */ if (psf->sf_count[MCAST_INCLUDE] || pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE]) break; if (srcs[i] == psf->sf_inaddr) { scount++; break; } } } pmc->gsquery = 0; if (scount == nsrcs) /* all sources excluded */ return 0; return 1; } static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) { struct ip_sf_list *psf; int i, scount; if (pmc->sfmode == MCAST_EXCLUDE) return igmp_xmarksources(pmc, nsrcs, srcs); /* mark INCLUDE-mode sources */ scount = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (scount == nsrcs) break; for (i = 0; i < nsrcs; i++) if (srcs[i] == psf->sf_inaddr) { psf->sf_gsresp = 1; scount++; break; } } if (!scount) { pmc->gsquery = 0; return 0; } pmc->gsquery = 1; return 1; } /* return true if packet was dropped */ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; if (ipv4_is_local_multicast(group) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == group) { igmp_stop_timer(im); break; } } rcu_read_unlock(); return false; } /* return true if packet was dropped */ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, int len) { struct igmphdr *ih = igmp_hdr(skb); struct igmpv3_query *ih3 = igmpv3_query_hdr(skb); struct ip_mc_list *im; __be32 group = ih->group; int max_delay; int mark = 0; struct net *net = dev_net(in_dev->dev); if (len == 8) { if (ih->code == 0) { /* Alas, old v1 router presents here. */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_v1_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; group = 0; } else { /* v2 router present */ max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); in_dev->mr_v2_seen = jiffies + (in_dev->mr_qrv * in_dev->mr_qi) + in_dev->mr_qri; } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); } else if (len < 12) { return true; /* ignore bogus packet; freed by caller */ } else if (IGMP_V1_SEEN(in_dev)) { /* This is a v3 query with v1 queriers present */ max_delay = IGMP_QUERY_RESPONSE_INTERVAL; group = 0; } else if (IGMP_V2_SEEN(in_dev)) { /* this is a v3 query with v2 queriers present; * Interpretation of the max_delay code is problematic here. * A real v2 host would use ih_code directly, while v3 has a * different encoding. We use the v3 encoding as more likely * to be intended in a v3 query. */ max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ } else { /* v3 */ if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) return true; ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) { if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + ntohs(ih3->nsrcs)*sizeof(__be32))) return true; ih3 = igmpv3_query_hdr(skb); } max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ in_dev->mr_maxdelay = max_delay; /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently * received value was zero, use the default or statically * configured value. */ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL; /* RFC3376, 8.3. Query Response Interval: * The number of seconds represented by the [Query Response * Interval] must be less than the [Query Interval]. */ if (in_dev->mr_qri >= in_dev->mr_qi) in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ; if (!group) { /* general query */ if (ih3->nsrcs) return true; /* no sources allowed */ igmp_gq_start_timer(in_dev); return false; } /* mark sources to include, if group & source-specific */ mark = ih3->nsrcs != 0; } /* * - Start the timers in all of our membership records * that the query applies to for the interface on * which the query arrived excl. those that belong * to a "local" group (224.0.0.X) * - For timers already running check if they need to * be reset. * - Use the igmp->igmp_code field as the maximum * delay possible */ rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { int changed; if (group && group != im->multiaddr) continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; else im->gsquery = mark; changed = !im->gsquery || igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); spin_unlock_bh(&im->lock); if (changed) igmp_mod_timer(im, max_delay); } rcu_read_unlock(); return false; } /* called in rcu_read_lock() section */ int igmp_rcv(struct sk_buff *skb) { /* This basically follows the spec line by line -- see RFC1112 */ struct igmphdr *ih; struct net_device *dev = skb->dev; struct in_device *in_dev; int len = skb->len; bool dropped = true; if (netif_is_l3_master(dev)) { dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif); if (!dev) goto drop; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto drop; if (!pskb_may_pull(skb, sizeof(struct igmphdr))) goto drop; if (skb_checksum_simple_validate(skb)) goto drop; ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_QUERY: dropped = igmp_heard_query(in_dev, skb, len); break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (rt_is_output_route(skb_rtable(skb))) break; /* don't rely on MC router hearing unicast reports */ if (skb->pkt_type == PACKET_MULTICAST || skb->pkt_type == PACKET_BROADCAST) dropped = igmp_heard_report(in_dev, ih->group); break; case IGMP_PIM: #ifdef CONFIG_IP_PIMSM_V1 return pim_rcv_v1(skb); #endif case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: case IGMP_MTRACE: case IGMP_MTRACE_RESP: break; default: break; } drop: if (dropped) kfree_skb(skb); else consume_skb(skb); return 0; } #endif /* * Add a filter to a device */ static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. We will get multicast token leakage, when IFF_MULTICAST is changed. This check should be done in ndo_set_rx_mode routine. Something sort of: if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } --ANK */ if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_add(dev, buf); } /* * Remove a filter from a device */ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) { char buf[MAX_ADDR_LEN]; struct net_device *dev = in_dev->dev; if (arp_mc_map(addr, buf, dev, 0) == 0) dev_mc_del(dev, buf); } #ifdef CONFIG_IP_MULTICAST /* * deleted ip_mc_list manipulation */ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im, gfp_t gfp) { struct ip_mc_list *pmc; struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not * used for management of the delete list. Using the same structure * for deleted items allows change reports to use common code with * non-deleted or query-response MCA's. */ pmc = kzalloc(sizeof(*pmc), gfp); if (!pmc) return; spin_lock_init(&pmc->lock); spin_lock_bh(&im->lock); pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; pmc->tomb = im->tomb; pmc->sources = im->sources; im->tomb = im->sources = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = pmc->crcount; } spin_unlock_bh(&im->lock); spin_lock_bh(&in_dev->mc_tomb_lock); pmc->next = in_dev->mc_tomb; in_dev->mc_tomb = pmc; spin_unlock_bh(&in_dev->mc_tomb_lock); } /* * restore ip_mc_list deleted records */ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc, *pmc_prev; struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); __be32 multiaddr = im->multiaddr; spin_lock_bh(&in_dev->mc_tomb_lock); pmc_prev = NULL; for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) { if (pmc->multiaddr == multiaddr) break; pmc_prev = pmc; } if (pmc) { if (pmc_prev) pmc_prev->next = pmc->next; else in_dev->mc_tomb = pmc->next; } spin_unlock_bh(&in_dev->mc_tomb_lock); spin_lock_bh(&im->lock); if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { swap(im->tomb, pmc->tomb); swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } else { im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); } in_dev_put(pmc->interface); kfree_pmc(pmc); } spin_unlock_bh(&im->lock); } /* * flush ip_mc_list deleted records */ static void igmpv3_clear_delrec(struct in_device *in_dev) { struct ip_mc_list *pmc, *nextpmc; spin_lock_bh(&in_dev->mc_tomb_lock); pmc = in_dev->mc_tomb; in_dev->mc_tomb = NULL; spin_unlock_bh(&in_dev->mc_tomb_lock); for (; pmc; pmc = nextpmc) { nextpmc = pmc->next; ip_mc_clear_src(pmc); in_dev_put(pmc->interface); kfree_pmc(pmc); } /* clear dead sources, too */ rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { struct ip_sf_list *psf; spin_lock_bh(&pmc->lock); psf = pmc->tomb; pmc->tomb = NULL; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(psf); } rcu_read_unlock(); } #endif static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); int reporter; #endif if (im->loaded) { im->loaded = 0; ip_mc_filter_del(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; reporter = im->reporter; igmp_stop_timer(im); if (!in_dev->dead) { if (IGMP_V1_SEEN(in_dev)) return; if (IGMP_V2_SEEN(in_dev)) { if (reporter) igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); return; } /* IGMPv3 */ igmpv3_add_delrec(in_dev, im, gfp); igmp_ifc_event(in_dev); } #endif } static void igmp_group_dropped(struct ip_mc_list *im) { __igmp_group_dropped(im, GFP_KERNEL); } static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST struct net *net = dev_net(in_dev->dev); #endif if (im->loaded == 0) { im->loaded = 1; ip_mc_filter_add(in_dev, im->multiaddr); } #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) return; if (in_dev->dead) return; im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { spin_lock_bh(&im->lock); igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY); spin_unlock_bh(&im->lock); return; } /* else, v3 */ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should * not send filter-mode change record as the mode should be from * IN() to IN(A). */ if (im->sfmode == MCAST_EXCLUDE) im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); igmp_ifc_event(in_dev); #endif } /* * Multicast list managers */ static u32 ip_mc_hash(const struct ip_mc_list *im) { return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG); } static void ip_mc_hash_add(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash; u32 hash; mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; rcu_assign_pointer(mc_hash[hash], im); return; } /* do not use a hash table for small number of items */ if (in_dev->mc_count < 4) return; mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG, GFP_KERNEL); if (!mc_hash) return; for_each_pmc_rtnl(in_dev, im) { hash = ip_mc_hash(im); im->next_hash = mc_hash[hash]; RCU_INIT_POINTER(mc_hash[hash], im); } rcu_assign_pointer(in_dev->mc_hash, mc_hash); } static void ip_mc_hash_remove(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash); struct ip_mc_list *aux; if (!mc_hash) return; mc_hash += ip_mc_hash(im); while ((aux = rtnl_dereference(*mc_hash)) != im) mc_hash = &aux->next_hash; *mc_hash = im->next_hash; } int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, const struct ip_mc_list *im, struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = 32; ifm->ifa_flags = IFA_F_PERMANENT; ifm->ifa_scope = RT_SCOPE_UNIVERSE; ifm->ifa_index = dev->ifindex; ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; ci.tstamp = ci.cstamp; ci.ifa_prefered = INFINITY_LIFE_TIME; ci.ifa_valid = INFINITY_LIFE_TIME; if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { struct inet_fill_args fillargs = { .event = event, }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(sizeof(__be32)) + nla_total_size(sizeof(struct ifa_cacheinfo)), GFP_KERNEL); if (!skb) goto error; err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); goto error; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); return; error: rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); } /* * A socket has joined a multicast group on device dev. */ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, unsigned int mode, gfp_t gfp) { struct ip_mc_list __rcu **mc_hash; struct ip_mc_list *im; ASSERT_RTNL(); mc_hash = rtnl_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG); for (im = rtnl_dereference(mc_hash[hash]); im; im = rtnl_dereference(im->next_hash)) { if (im->multiaddr == addr) break; } } else { for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == addr) break; } } if (im) { im->users++; ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0); goto out; } im = kzalloc(sizeof(*im), gfp); if (!im) goto out; im->users = 1; im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; im->mca_cstamp = jiffies; im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; refcount_set(&im->refcnt, 1); spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST timer_setup(&im->timer, igmp_timer_expire, 0); #endif im->next_rcu = in_dev->mc_list; in_dev->mc_count++; rcu_assign_pointer(in_dev->mc_list, im); ip_mc_hash_add(in_dev, im); #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: return; } void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp); } EXPORT_SYMBOL(__ip_mc_inc_group); void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { __ip_mc_inc_group(in_dev, addr, GFP_KERNEL); } EXPORT_SYMBOL(ip_mc_inc_group); static int ip_mc_check_iphdr(struct sk_buff *skb) { const struct iphdr *iph; unsigned int len; unsigned int offset = skb_network_offset(skb) + sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) return -EINVAL; offset += ip_hdrlen(skb) - sizeof(*iph); if (!pskb_may_pull(skb, offset)) return -EINVAL; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) return -EINVAL; len = skb_network_offset(skb) + ntohs(iph->tot_len); if (skb->len < len || len < offset) return -EINVAL; skb_set_transport_header(skb, offset); return 0; } static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb); len += sizeof(struct igmpv3_report); return ip_mc_may_pull(skb, len) ? 0 : -EINVAL; } static int ip_mc_check_igmp_query(struct sk_buff *skb) { unsigned int transport_len = ip_transport_len(skb); unsigned int len; /* IGMPv{1,2}? */ if (transport_len != sizeof(struct igmphdr)) { /* or IGMPv3? */ if (transport_len < sizeof(struct igmpv3_query)) return -EINVAL; len = skb_transport_offset(skb) + sizeof(struct igmpv3_query); if (!ip_mc_may_pull(skb, len)) return -EINVAL; } /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer * all-systems destination addresses (224.0.0.1) for general queries */ if (!igmp_hdr(skb)->group && ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) return -EINVAL; return 0; } static int ip_mc_check_igmp_msg(struct sk_buff *skb) { switch (igmp_hdr(skb)->type) { case IGMP_HOST_LEAVE_MESSAGE: case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: return 0; case IGMPV3_HOST_MEMBERSHIP_REPORT: return ip_mc_check_igmp_reportv3(skb); case IGMP_HOST_MEMBERSHIP_QUERY: return ip_mc_check_igmp_query(skb); default: return -ENOMSG; } } static __sum16 ip_mc_validate_checksum(struct sk_buff *skb) { return skb_checksum_simple_validate(skb); } static int ip_mc_check_igmp_csum(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); unsigned int transport_len = ip_transport_len(skb); struct sk_buff *skb_chk; if (!ip_mc_may_pull(skb, len)) return -EINVAL; skb_chk = skb_checksum_trimmed(skb, transport_len, ip_mc_validate_checksum); if (!skb_chk) return -EINVAL; if (skb_chk != skb) kfree_skb(skb_chk); return 0; } /** * ip_mc_check_igmp - checks whether this is a sane IGMP packet * @skb: the skb to validate * * Checks whether an IPv4 packet is a valid IGMP packet. If so sets * skb transport header accordingly and returns zero. * * -EINVAL: A broken packet was detected, i.e. it violates some internet * standard * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. * -ENOMEM: A memory allocation failure happened. * * Caller needs to set the skb network header and free any returned skb if it * differs from the provided skb. */ int ip_mc_check_igmp(struct sk_buff *skb) { int ret = ip_mc_check_iphdr(skb); if (ret < 0) return ret; if (ip_hdr(skb)->protocol != IPPROTO_IGMP) return -ENOMSG; ret = ip_mc_check_igmp_csum(skb); if (ret < 0) return ret; return ip_mc_check_igmp_msg(skb); } EXPORT_SYMBOL(ip_mc_check_igmp); /* * Resend IGMP JOIN report; used by netdev notifier. */ static void ip_mc_rejoin_groups(struct in_device *in_dev) { #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) continue; /* a failover is happening and switches * must be notified immediately */ if (IGMP_V1_SEEN(in_dev)) type = IGMP_HOST_MEMBERSHIP_REPORT; else if (IGMP_V2_SEEN(in_dev)) type = IGMPV2_HOST_MEMBERSHIP_REPORT; else type = IGMPV3_HOST_MEMBERSHIP_REPORT; igmp_send_report(in_dev, im, type); } #endif } /* * A socket has left a multicast group on device dev */ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) { struct ip_mc_list *i; struct ip_mc_list __rcu **ip; ASSERT_RTNL(); for (ip = &in_dev->mc_list; (i = rtnl_dereference(*ip)) != NULL; ip = &i->next_rcu) { if (i->multiaddr == addr) { if (--i->users == 0) { ip_mc_hash_remove(in_dev, i); *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); inet_ifmcaddr_notify(in_dev->dev, i, RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) ip_rt_multicast_event(in_dev); ip_ma_put(i); return; } break; } } } EXPORT_SYMBOL(__ip_mc_dec_group); /* Device changing type */ void ip_mc_unmap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); } void ip_mc_remap(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* Device going down */ void ip_mc_down(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); for_each_pmc_rtnl(in_dev, pmc) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; if (timer_delete(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); } #ifdef CONFIG_IP_MULTICAST static void ip_mc_reset(struct in_device *in_dev) { struct net *net = dev_net(in_dev->dev); in_dev->mr_qi = IGMP_QUERY_INTERVAL; in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL; in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv); } #else static void ip_mc_reset(struct in_device *in_dev) { } #endif void ip_mc_init_dev(struct in_device *in_dev) { ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0); timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0); #endif ip_mc_reset(in_dev); spin_lock_init(&in_dev->mc_tomb_lock); } /* Device going up */ void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; ASSERT_RTNL(); ip_mc_reset(in_dev); ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); for_each_pmc_rtnl(in_dev, pmc) { #ifdef CONFIG_IP_MULTICAST igmpv3_del_delrec(in_dev, pmc); #endif igmp_group_added(pmc); } } /* * Device is about to be destroyed: clean up. */ void ip_mc_destroy_dev(struct in_device *in_dev) { struct ip_mc_list *i; ASSERT_RTNL(); /* Deactivate timers */ ip_mc_down(in_dev); #ifdef CONFIG_IP_MULTICAST igmpv3_clear_delrec(in_dev); #endif while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; ip_mc_clear_src(i); ip_ma_put(i); } } /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct net_device *dev = NULL; struct in_device *idev = NULL; if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); return idev; } if (imr->imr_address.s_addr) { dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; } if (!dev) { struct rtable *rt = ip_route_output(net, imr->imr_multiaddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE); if (!IS_ERR(rt)) { dev = rt->dst.dev; ip_rt_put(rt); } } if (dev) { imr->imr_ifindex = dev->ifindex; idev = __in_dev_get_rtnl(dev); } return idev; } /* * Join a socket to a group */ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; int rv = 0; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf || psf->sf_count[sfmode] == 0) { /* source filter not found, or count wrong => bug */ return -ESRCH; } psf->sf_count[sfmode]--; if (psf->sf_count[sfmode] == 0) { ip_rt_multicast_event(pmc->interface); } if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ if (psf_prev) psf_prev->sf_next = psf->sf_next; else pmc->sources = psf->sf_next; #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; } else #endif kfree(psf); } return rv; } #ifndef CONFIG_IP_MULTICAST #define igmp_ifc_event(x) do { } while (0) #endif static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int changerec = 0; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif if (!delta) { err = -EINVAL; if (!pmc->sfcount[sfmode]) goto out_unlock; pmc->sfcount[sfmode]--; } err = 0; for (i = 0; i < sfcount; i++) { int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]); changerec |= rv > 0; if (!err && rv < 0) err = rv; } if (pmc->sfmode == MCAST_EXCLUDE && pmc->sfcount[MCAST_EXCLUDE] == 0 && pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); } else if (sf_setstate(pmc) || changerec) { igmp_ifc_event(pmc->interface); #endif } out_unlock: spin_unlock_bh(&pmc->lock); return err; } /* * Add multicast single-source filter to the interface list */ static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) { struct ip_sf_list *psf, *psf_prev; psf_prev = NULL; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == *psfsrc) break; psf_prev = psf; } if (!psf) { psf = kzalloc(sizeof(*psf), GFP_ATOMIC); if (!psf) return -ENOBUFS; psf->sf_inaddr = *psfsrc; if (psf_prev) { psf_prev->sf_next = psf; } else pmc->sources = psf; } psf->sf_count[sfmode]++; if (psf->sf_count[sfmode] == 1) { ip_rt_multicast_event(pmc->interface); } return 0; } #ifdef CONFIG_IP_MULTICAST static void sf_markstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; for (psf = pmc->sources; psf; psf = psf->sf_next) if (pmc->sfcount[MCAST_EXCLUDE]) { psf->sf_oldin = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; } static int sf_setstate(struct ip_mc_list *pmc) { struct ip_sf_list *psf, *dpsf; int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; int qrv = pmc->interface->mr_qrv; int new_in, rv; rv = 0; for (psf = pmc->sources; psf; psf = psf->sf_next) { if (pmc->sfcount[MCAST_EXCLUDE]) { new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && !psf->sf_count[MCAST_INCLUDE]; } else new_in = psf->sf_count[MCAST_INCLUDE] != 0; if (new_in) { if (!psf->sf_oldin) { struct ip_sf_list *prev = NULL; for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) { if (dpsf->sf_inaddr == psf->sf_inaddr) break; prev = dpsf; } if (dpsf) { if (prev) prev->sf_next = dpsf->sf_next; else pmc->tomb = dpsf->sf_next; kfree(dpsf); } psf->sf_crcount = qrv; rv++; } } else if (psf->sf_oldin) { psf->sf_crcount = 0; /* * add or update "delete" records if an active filter * is now inactive */ for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) if (dpsf->sf_inaddr == psf->sf_inaddr) break; if (!dpsf) { dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC); if (!dpsf) continue; *dpsf = *psf; /* pmc->lock held by callers */ dpsf->sf_next = pmc->tomb; pmc->tomb = dpsf; } dpsf->sf_crcount = qrv; rv++; } } return rv; } #endif /* * Add multicast source filter list to the interface list */ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, int sfcount, __be32 *psfsrc, int delta) { struct ip_mc_list *pmc; int isexclude; int i, err; if (!in_dev) return -ENODEV; rcu_read_lock(); for_each_pmc_rcu(in_dev, pmc) { if (*pmca == pmc->multiaddr) break; } if (!pmc) { /* MCA not found?? bug */ rcu_read_unlock(); return -ESRCH; } spin_lock_bh(&pmc->lock); rcu_read_unlock(); #ifdef CONFIG_IP_MULTICAST sf_markstate(pmc); #endif isexclude = pmc->sfmode == MCAST_EXCLUDE; if (!delta) pmc->sfcount[sfmode]++; err = 0; for (i = 0; i < sfcount; i++) { err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]); if (err) break; } if (err) { int j; if (!delta) pmc->sfcount[sfmode]--; for (j = 0; j < i; j++) (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif /* filter mode change */ if (pmc->sfcount[MCAST_EXCLUDE]) pmc->sfmode = MCAST_EXCLUDE; else if (pmc->sfcount[MCAST_INCLUDE]) pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv); WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); } else if (sf_setstate(pmc)) { igmp_ifc_event(in_dev); #endif } spin_unlock_bh(&pmc->lock); return err; } static void ip_mc_clear_src(struct ip_mc_list *pmc) { struct ip_sf_list *tomb, *sources; spin_lock_bh(&pmc->lock); tomb = pmc->tomb; pmc->tomb = NULL; sources = pmc->sources; pmc->sources = NULL; pmc->sfmode = MCAST_EXCLUDE; pmc->sfcount[MCAST_INCLUDE] = 0; pmc->sfcount[MCAST_EXCLUDE] = 1; spin_unlock_bh(&pmc->lock); ip_sf_list_clear_all(tomb); ip_sf_list_clear_all(sources); } /* Join a multicast group */ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { __be32 addr = imr->imr_multiaddr.s_addr; struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; int err; ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRINUSE; ifindex = imr->imr_ifindex; for_each_pmc_rtnl(inet, i) { if (i->multi.imr_multiaddr.s_addr == addr && i->multi.imr_ifindex == ifindex) goto done; count++; } err = -ENOBUFS; if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships)) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) goto done; memcpy(&iml->multi, imr, sizeof(*imr)); iml->next_rcu = inet->mc_list; iml->sflist = NULL; iml->sfmode = mode; rcu_assign_pointer(inet->mc_list, iml); ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL); err = 0; done: return err; } /* Join ASM (Any-Source Multicast) group */ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE); } EXPORT_SYMBOL(ip_mc_join_group); /* Join SSM (Source-Specific Multicast) group */ int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode) { return __ip_mc_join_group(sk, imr, mode); } static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist); int err; if (!psf) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct ip_mc_socklist __rcu **imlp; struct in_device *in_dev; struct net *net = sock_net(sk); __be32 group = imr->imr_multiaddr.s_addr; u32 ifindex; int ret = -EADDRNOTAVAIL; ASSERT_RTNL(); in_dev = ip_mc_find_dev(net, imr); if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) { ret = -ENODEV; goto out; } ifindex = imr->imr_ifindex; for (imlp = &inet->mc_list; (iml = rtnl_dereference(*imlp)) != NULL; imlp = &iml->next_rcu) { if (iml->multi.imr_multiaddr.s_addr != group) continue; if (ifindex) { if (iml->multi.imr_ifindex != ifindex) continue; } else if (imr->imr_address.s_addr && imr->imr_address.s_addr != iml->multi.imr_address.s_addr) continue; (void) ip_mc_leave_src(sk, iml, in_dev); *imlp = iml->next_rcu; if (in_dev) ip_mc_dec_group(in_dev, group); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: return ret; } EXPORT_SYMBOL(ip_mc_leave_group); int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex) { int err; struct ip_mreqn imr; __be32 addr = mreqs->imr_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev = NULL; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); int leavegroup = 0; int i, j, rv; if (!ipv4_is_multicast(addr)) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if ((pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr) && (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } /* if a source filter was set, must be the same mode as before */ if (pmc->sflist) { if (pmc->sfmode != omode) { err = -EINVAL; goto done; } } else if (pmc->sfmode != omode) { /* allow mode switches for empty-set filters */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, NULL, 0); pmc->sfmode = omode; } psl = rtnl_dereference(pmc->sflist); if (!add) { if (!psl) goto done; /* err = -EADDRNOTAVAIL */ rv = !0; for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv) /* source not found */ goto done; /* err = -EADDRNOTAVAIL */ /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (psl->sl_count == 1 && omode == MCAST_INCLUDE) { leavegroup = 1; goto done; } /* update the interface filter */ ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); for (j = i+1; j < psl->sl_count; j++) psl->sl_addr[j-1] = psl->sl_addr[j]; psl->sl_count--; err = 0; goto done; } /* else, add a new source to the filter */ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) { err = -ENOBUFS; goto done; } if (!psl || psl->sl_count == psl->sl_max) { struct ip_sf_socklist *newpsl; int count = IP_SFBLOCK; if (psl) count += psl->sl_max; newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = count; newpsl->sl_count = count - IP_SFBLOCK; if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i = 0; i < psl->sl_count; i++) { rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, sizeof(__be32)); if (rv == 0) break; } if (rv == 0) /* address already there is an error */ goto done; for (j = psl->sl_count-1; j >= i; j--) psl->sl_addr[j+1] = psl->sl_addr[j]; psl->sl_addr[i] = mreqs->imr_sourceaddr; psl->sl_count++; err = 0; /* update the interface list */ ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) { int err = 0; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *newpsl, *psl; struct net *net = sock_net(sk); int leavegroup = 0; if (!ipv4_is_multicast(addr)) return -EINVAL; if (msf->imsf_fmode != MCAST_INCLUDE && msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } /* special case - (INCLUDE, empty) == LEAVE_GROUP */ if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) { leavegroup = 1; goto done; } for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) { /* must have a prior join */ err = -EINVAL; goto done; } if (msf->imsf_numsrc) { newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, msf->imsf_numsrc), GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; memcpy(newpsl->sl_addr, msf->imsf_slist_flex, flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, newpsl->sl_max)); goto done; } } else { newpsl = NULL; (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, 0, NULL, 0); } psl = rtnl_dereference(pmc->sflist); if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ atomic_sub(struct_size(psl, sl_addr, psl->sl_max), &sk->sk_omem_alloc); } else { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); } rcu_assign_pointer(pmc->sflist, newpsl); if (psl) kfree_rcu(psl, rcu); pmc->sfmode = msf->imsf_fmode; err = 0; done: if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; } int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen) { int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; struct net *net = sock_net(sk); ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; in_dev = ip_mc_find_dev(net, &imr); if (!in_dev) { err = -ENODEV; goto done; } err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && pmc->multi.imr_ifindex == imr.imr_ifindex) break; } if (!pmc) /* must have a prior join */ goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); if (!psl) { count = 0; } else { count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; msf_size = IP_MSFILTER_SIZE(copycount); if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && copy_to_sockptr_offset(optval, offsetof(struct ip_msfilter, imsf_slist_flex), psl->sl_addr, len)) return -EFAULT; return 0; done: return err; } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; ASSERT_RTNL(); psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; addr = psin->sin_addr.s_addr; if (!ipv4_is_multicast(addr)) return -EINVAL; for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; ss_offset += sizeof(ss); } return 0; } /* * check if a multicast source filter allows delivery for a given <src,dst,intf> */ int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif, int sdif) { const struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; int ret; ret = 1; if (!ipv4_is_multicast(loc_addr)) goto out; rcu_read_lock(); for_each_pmc_rcu(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && (pmc->multi.imr_ifindex == dif || (sdif && pmc->multi.imr_ifindex == sdif))) break; } ret = inet_test_bit(MC_ALL, sk); if (!pmc) goto unlock; psl = rcu_dereference(pmc->sflist); ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) goto unlock; for (i = 0; i < psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) goto unlock; ret = 1; unlock: rcu_read_unlock(); out: return ret; } /* * A socket is closing. */ void ip_mc_drop_socket(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_mc_socklist *iml; struct net *net = sock_net(sk); if (!inet->mc_list) return; rtnl_lock(); while ((iml = rtnl_dereference(inet->mc_list)) != NULL) { struct in_device *in_dev; inet->mc_list = iml->next_rcu; in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); if (in_dev) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); } rtnl_unlock(); } /* called with rcu_read_lock() */ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto) { struct ip_mc_list *im; struct ip_mc_list __rcu **mc_hash; struct ip_sf_list *psf; int rv = 0; mc_hash = rcu_dereference(in_dev->mc_hash); if (mc_hash) { u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG); for (im = rcu_dereference(mc_hash[hash]); im != NULL; im = rcu_dereference(im->next_hash)) { if (im->multiaddr == mc_addr) break; } } else { for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; } } if (im && proto == IPPROTO_IGMP) { rv = 1; } else if (im) { if (src_addr) { spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; } if (psf) rv = psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE] != im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } return rv; } #if defined(CONFIG_PROC_FS) struct igmp_mc_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *in_dev; }; #define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_mc_list *im = NULL; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; im = rcu_dereference(in_dev->mc_list); if (im) { state->in_dev = in_dev; break; } } return im; } static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = rcu_dereference(im->next_rcu); while (!im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; im = rcu_dereference(state->in_dev->mc_list); } return im; } static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) { struct ip_mc_list *im = igmp_mc_get_first(seq); if (im) while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) --pos; return pos ? NULL : im; } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_mc_list *im; if (v == SEQ_START_TOKEN) im = igmp_mc_get_first(seq); else im = igmp_mc_get_next(seq, v); ++*pos; return im; } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_puts(seq, "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); else { struct ip_mc_list *im = v; struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); char *querier; long delta; #ifdef CONFIG_IP_MULTICAST querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : IGMP_V2_SEEN(state->in_dev) ? "V2" : "V3"; #else querier = "NONE"; #endif if (rcu_access_pointer(state->in_dev->mc_list) == im) { seq_printf(seq, "%d\t%-10s: %5d %7s\n", state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier); } delta = im->timer.expires - jiffies; seq_printf(seq, "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_delta_to_clock_t(delta) : 0, im->reporter); } return 0; } static const struct seq_operations igmp_mc_seq_ops = { .start = igmp_mc_seq_start, .next = igmp_mc_seq_next, .stop = igmp_mc_seq_stop, .show = igmp_mc_seq_show, }; struct igmp_mcf_iter_state { struct seq_net_private p; struct net_device *dev; struct in_device *idev; struct ip_mc_list *im; }; #define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct ip_sf_list *psf = NULL; struct ip_mc_list *im = NULL; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); state->idev = NULL; state->im = NULL; for_each_netdev_rcu(net, state->dev) { struct in_device *idev; idev = __in_dev_get_rcu(state->dev); if (unlikely(!idev)) continue; im = rcu_dereference(idev->mc_list); if (likely(im)) { spin_lock_bh(&im->lock); psf = im->sources; if (likely(psf)) { state->im = im; state->idev = idev; break; } spin_unlock_bh(&im->lock); } } return psf; } static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); psf = psf->sf_next; while (!psf) { spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; state->im = rcu_dereference(state->idev->mc_list); } spin_lock_bh(&state->im->lock); psf = state->im->sources; } out: return psf; } static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) { struct ip_sf_list *psf = igmp_mcf_get_first(seq); if (psf) while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) --pos; return pos ? NULL : psf; } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_sf_list *psf; if (v == SEQ_START_TOKEN) psf = igmp_mcf_get_first(seq); else psf = igmp_mcf_get_next(seq, v); ++*pos; return psf; } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im)) { spin_unlock_bh(&state->im->lock); state->im = NULL; } state->idev = NULL; state->dev = NULL; rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) { struct ip_sf_list *psf = v; struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (v == SEQ_START_TOKEN) { seq_puts(seq, "Idx Device MCA SRC INC EXC\n"); } else { seq_printf(seq, "%3d %6.6s 0x%08x " "0x%08x %6lu %6lu\n", state->dev->ifindex, state->dev->name, ntohl(state->im->multiaddr), ntohl(psf->sf_inaddr), psf->sf_count[MCAST_INCLUDE], psf->sf_count[MCAST_EXCLUDE]); } return 0; } static const struct seq_operations igmp_mcf_seq_ops = { .start = igmp_mcf_seq_start, .next = igmp_mcf_seq_next, .stop = igmp_mcf_seq_stop, .show = igmp_mcf_seq_show, }; static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; int err; pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops, sizeof(struct igmp_mc_iter_state)); if (!pde) goto out_igmp; pde = proc_create_net("mcfilter", 0444, net->proc_net, &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)); if (!pde) goto out_mcfilter; err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, SOCK_DGRAM, 0, net); if (err < 0) { pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", err); goto out_sock; } return 0; out_sock: remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: return -ENOMEM; } static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { .init = igmp_net_init, .exit = igmp_net_exit, }; #endif static int igmp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev; switch (event) { case NETDEV_RESEND_IGMP: in_dev = __in_dev_get_rtnl(dev); if (in_dev) ip_mc_rejoin_groups(in_dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block igmp_notifier = { .notifier_call = igmp_netdev_event, }; int __init igmp_mc_init(void) { #if defined(CONFIG_PROC_FS) int err; err = register_pernet_subsys(&igmp_net_ops); if (err) return err; err = register_netdevice_notifier(&igmp_notifier); if (err) goto reg_notif_fail; return 0; reg_notif_fail: unregister_pernet_subsys(&igmp_net_ops); return err; #else return register_netdevice_notifier(&igmp_notifier); #endif }
14 14 6 9 11 2 9 4 4 4 4 94 85 15 32 32 59 2 3 3 4 53 57 53 19 65 60 2 60 59 6 57 59 63 271 271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 // SPDX-License-Identifier: GPL-2.0-only /* * irqchip.c: Common API for in kernel interrupt controllers * Copyright (c) 2007, Intel Corporation. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * Copyright (c) 2013, Alexander Graf <agraf@suse.de> * * This file is derived from virt/kvm/irq_comm.c. * * Authors: * Yaozu (Eddie) Dong <Eddie.dong@intel.com> * Alexander Graf <agraf@suse.de> */ #include <linux/kvm_host.h> #include <linux/slab.h> #include <linux/srcu.h> #include <linux/export.h> #include <trace/events/kvm.h> int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { struct kvm_irq_routing_table *irq_rt; struct kvm_kernel_irq_routing_entry *e; int n = 0; irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu, lockdep_is_held(&kvm->irq_lock)); if (irq_rt && gsi < irq_rt->nr_rt_entries) { hlist_for_each_entry(e, &irq_rt->map[gsi], link) { entries[n] = *e; ++n; } } return n; } int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_routing_table *irq_rt; irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); return irq_rt->chip[irqchip][pin]; } int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) { struct kvm_kernel_irq_routing_entry route; if (!kvm_arch_irqchip_in_kernel(kvm) || (msi->flags & ~KVM_MSI_VALID_DEVID)) return -EINVAL; route.msi.address_lo = msi->address_lo; route.msi.address_hi = msi->address_hi; route.msi.data = msi->data; route.msi.flags = msi->flags; route.msi.devid = msi->devid; return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false); } /* * Return value: * < 0 Interrupt was ignored (masked or not delivered for other reasons) * = 0 Interrupt was coalesced (previous irq is still pending) * > 0 Number of CPUs interrupt was delivered to */ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; int ret = -1, i, idx; trace_kvm_set_irq(irq, level, irq_source_id); /* Not possible to detect if the guest uses the PIC or the * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ idx = srcu_read_lock(&kvm->irq_srcu); i = kvm_irq_map_gsi(kvm, irq_set, irq); srcu_read_unlock(&kvm->irq_srcu, idx); while (i--) { int r; r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, line_status); if (r < 0) continue; ret = r + ((ret < 0) ? 0 : ret); } return ret; } static void free_irq_routing_table(struct kvm_irq_routing_table *rt) { int i; if (!rt) return; for (i = 0; i < rt->nr_rt_entries; ++i) { struct kvm_kernel_irq_routing_entry *e; struct hlist_node *n; hlist_for_each_entry_safe(e, n, &rt->map[i], link) { hlist_del(&e->link); kfree(e); } } kfree(rt); } void kvm_free_irq_routing(struct kvm *kvm) { /* Called only during vm destruction. Nobody can use the pointer at this stage */ struct kvm_irq_routing_table *rt = rcu_access_pointer(kvm->irq_routing); free_irq_routing_table(rt); } static int setup_routing_entry(struct kvm *kvm, struct kvm_irq_routing_table *rt, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { struct kvm_kernel_irq_routing_entry *ei; int r; u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES); /* * Do not allow GSI to be mapped to the same irqchip more than once. * Allow only one to one mapping between GSI and non-irqchip routing. */ hlist_for_each_entry(ei, &rt->map[gsi], link) if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) return -EINVAL; e->gsi = gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) return r; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); return 0; } void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) { return true; } int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, unsigned flags) { struct kvm_irq_routing_table *new, *old; struct kvm_kernel_irq_routing_entry *e; u32 i, j, nr_rt_entries = 0; int r; for (i = 0; i < nr; ++i) { if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES) return -EINVAL; nr_rt_entries = max(nr_rt_entries, ue[i].gsi); } nr_rt_entries += 1; new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->nr_rt_entries = nr_rt_entries; for (i = 0; i < KVM_NR_IRQCHIPS; i++) for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++) new->chip[i][j] = -1; for (i = 0; i < nr; ++i) { r = -ENOMEM; e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT); if (!e) goto out; r = -EINVAL; switch (ue->type) { case KVM_IRQ_ROUTING_MSI: if (ue->flags & ~KVM_MSI_VALID_DEVID) goto free_entry; break; default: if (ue->flags) goto free_entry; break; } r = setup_routing_entry(kvm, new, e, ue); if (r) goto free_entry; ++ue; } mutex_lock(&kvm->irq_lock); old = rcu_dereference_protected(kvm->irq_routing, 1); rcu_assign_pointer(kvm->irq_routing, new); kvm_irq_routing_update(kvm); kvm_arch_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); kvm_arch_post_irq_routing_update(kvm); synchronize_srcu_expedited(&kvm->irq_srcu); new = old; r = 0; goto out; free_entry: kfree(e); out: free_irq_routing_table(new); return r; } /* * Allocate empty IRQ routing by default so that additional setup isn't needed * when userspace-driven IRQ routing is activated, and so that kvm->irq_routing * is guaranteed to be non-NULL. */ int kvm_init_irq_routing(struct kvm *kvm) { struct kvm_irq_routing_table *new; int chip_size; new = kzalloc(struct_size(new, map, 1), GFP_KERNEL_ACCOUNT); if (!new) return -ENOMEM; new->nr_rt_entries = 1; chip_size = sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS; memset(new->chip, -1, chip_size); RCU_INIT_POINTER(kvm->irq_routing, new); return 0; }
959 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_PGTABLE_PROT_H #define __ASM_PGTABLE_PROT_H #include <asm/memory.h> #include <asm/pgtable-hwdef.h> #include <linux/const.h> /* * Software defined PTE bits definition. */ #define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) #define PTE_DEVMAP (_AT(pteval_t, 1) << 57) /* * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be * interpreted according to the HW layout by SW but any attempted HW access to * the address will result in a fault. pte_present() returns true. */ #define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */ #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */ #define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */ #else #define PTE_UFFD_WP (_AT(pteval_t, 0)) #define PTE_SWP_UFFD_WP (_AT(pteval_t, 0)) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) #define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF) #define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC)) #define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL)) #define PROT_NORMAL_TAGGED (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) #define _PAGE_KERNEL (PROT_NORMAL) #define _PAGE_KERNEL_RO ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) #define _PAGE_KERNEL_ROX ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) #define _PAGE_KERNEL_EXEC (PROT_NORMAL & ~PTE_PXN) #define _PAGE_KERNEL_EXEC_CONT ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) #define _PAGE_SHARED (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define _PAGE_SHARED_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE) #define _PAGE_READONLY (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define _PAGE_READONLY_EXEC (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define _PAGE_EXECONLY (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #ifndef __ASSEMBLY__ #include <asm/cpufeature.h> #include <asm/pgtable-types.h> #include <asm/rsi.h> extern bool arm64_use_ng_mappings; extern unsigned long prot_ns_shared; #define PROT_NS_SHARED (is_realm_world() ? prot_ns_shared : 0) #define PTE_MAYBE_NG (arm64_use_ng_mappings ? PTE_NG : 0) #define PMD_MAYBE_NG (arm64_use_ng_mappings ? PMD_SECT_NG : 0) #ifndef CONFIG_ARM64_LPA2 #define lpa2_is_enabled() false #define PTE_MAYBE_SHARED PTE_SHARED #define PMD_MAYBE_SHARED PMD_SECT_S #define PHYS_MASK_SHIFT (CONFIG_ARM64_PA_BITS) #else static inline bool __pure lpa2_is_enabled(void) { return read_tcr() & TCR_DS; } #define PTE_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PTE_SHARED) #define PMD_MAYBE_SHARED (lpa2_is_enabled() ? 0 : PMD_SECT_S) #define PHYS_MASK_SHIFT (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48) #endif /* * Highest possible physical address supported. */ #define PHYS_MASK ((UL(1) << PHYS_MASK_SHIFT) - 1) /* * If we have userspace only BTI we don't want to mark kernel pages * guarded even if the system does support BTI. */ #define PTE_MAYBE_GP (system_supports_bti_kernel() ? PTE_GP : 0) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) #define PAGE_KERNEL_RO __pgprot(_PAGE_KERNEL_RO) #define PAGE_KERNEL_ROX __pgprot(_PAGE_KERNEL_ROX) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL_EXEC) #define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_KERNEL_EXEC_CONT) #define PAGE_S2_MEMATTR(attr, has_fwb) \ ({ \ u64 __val; \ if (has_fwb) \ __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ else \ __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \ __val; \ }) #define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) /* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */ #define PAGE_SHARED __pgprot(_PAGE_SHARED) #define PAGE_SHARED_EXEC __pgprot(_PAGE_SHARED_EXEC) #define PAGE_READONLY __pgprot(_PAGE_READONLY) #define PAGE_READONLY_EXEC __pgprot(_PAGE_READONLY_EXEC) #define PAGE_EXECONLY __pgprot(_PAGE_EXECONLY) #endif /* __ASSEMBLY__ */ #define pte_pi_index(pte) ( \ ((pte & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \ ((pte & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \ ((pte & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \ ((pte & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0))) /* * Page types used via Permission Indirection Extension (PIE). PIE uses * the USER, DBM, PXN and UXN bits to to generate an index which is used * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define * combinations we use on non-PIE systems with the same encoding, for * convenience these are listed here as comments as are the unallocated * encodings. */ /* 0: PAGE_DEFAULT */ /* 1: PTE_USER */ /* 2: PTE_WRITE */ /* 3: PTE_WRITE | PTE_USER */ /* 4: PAGE_EXECONLY PTE_PXN */ /* 5: PAGE_READONLY_EXEC PTE_PXN | PTE_USER */ /* 6: PTE_PXN | PTE_WRITE */ /* 7: PAGE_SHARED_EXEC PTE_PXN | PTE_WRITE | PTE_USER */ /* 8: PAGE_KERNEL_ROX PTE_UXN */ /* 9: PAGE_GCS_RO PTE_UXN | PTE_USER */ /* a: PAGE_KERNEL_EXEC PTE_UXN | PTE_WRITE */ /* b: PAGE_GCS PTE_UXN | PTE_WRITE | PTE_USER */ /* c: PAGE_KERNEL_RO PTE_UXN | PTE_PXN */ /* d: PAGE_READONLY PTE_UXN | PTE_PXN | PTE_USER */ /* e: PAGE_KERNEL PTE_UXN | PTE_PXN | PTE_WRITE */ /* f: PAGE_SHARED PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */ #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) #define PAGE_GCS __pgprot(_PAGE_GCS) #define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_X_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RWX_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW_O)) #define PIE_E1 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY), PIE_NONE_O) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED), PIE_RW) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX), PIE_RX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC), PIE_RWX) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO), PIE_R) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL), PIE_RW)) #endif /* __ASM_PGTABLE_PROT_H */
476 274 349 418 506 506 349 506 506 959 28 28 38 38 551 377 261 768 622 366 771 550 769 772 768 769 348 349 76 169 76 635 635 50 78 78 958 957 955 957 944 99 125 184 98 60 768 252 78 830 862 529 488 769 772 98 98 60 60 90 90 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_PGTABLE_H #define __ASM_PGTABLE_H #include <asm/bug.h> #include <asm/proc-fns.h> #include <asm/memory.h> #include <asm/mte.h> #include <asm/pgtable-hwdef.h> #include <asm/pgtable-prot.h> #include <asm/tlbflush.h> /* * VMALLOC range. * * VMALLOC_START: beginning of the kernel vmalloc space * VMALLOC_END: extends to the available space below vmemmap */ #define VMALLOC_START (MODULES_END) #if VA_BITS == VA_BITS_MIN #define VMALLOC_END (VMEMMAP_START - SZ_8M) #else #define VMEMMAP_UNUSED_NPAGES ((_PAGE_OFFSET(vabits_actual) - PAGE_OFFSET) >> PAGE_SHIFT) #define VMALLOC_END (VMEMMAP_START + VMEMMAP_UNUSED_NPAGES * sizeof(struct page) - SZ_8M) #endif #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) #ifndef __ASSEMBLY__ #include <asm/cmpxchg.h> #include <asm/fixmap.h> #include <asm/por.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/sched.h> #include <linux/page_table_check.h> static inline void emit_pte_barriers(void) { /* * These barriers are emitted under certain conditions after a pte entry * was modified (see e.g. __set_pte_complete()). The dsb makes the store * visible to the table walker. The isb ensures that any previous * speculative "invalid translation" marker that is in the CPU's * pipeline gets cleared, so that any access to that address after * setting the pte to valid won't cause a spurious fault. If the thread * gets preempted after storing to the pgtable but before emitting these * barriers, __switch_to() emits a dsb which ensure the walker gets to * see the store. There is no guarantee of an isb being issued though. * This is safe because it will still get issued (albeit on a * potentially different CPU) when the thread starts running again, * before any access to the address. */ dsb(ishst); isb(); } static inline void queue_pte_barriers(void) { unsigned long flags; if (in_interrupt()) { emit_pte_barriers(); return; } flags = read_thread_flags(); if (flags & BIT(TIF_LAZY_MMU)) { /* Avoid the atomic op if already set. */ if (!(flags & BIT(TIF_LAZY_MMU_PENDING))) set_thread_flag(TIF_LAZY_MMU_PENDING); } else { emit_pte_barriers(); } } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { /* * lazy_mmu_mode is not supposed to permit nesting. But in practice this * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation * inside a lazy_mmu_mode section (such as zap_pte_range()) will change * permissions on the linear map with apply_to_page_range(), which * re-enters lazy_mmu_mode. So we tolerate nesting in our * implementation. The first call to arch_leave_lazy_mmu_mode() will * flush and clear the flag such that the remainder of the work in the * outer nest behaves as if outside of lazy mmu mode. This is safe and * keeps tracking simple. */ if (in_interrupt()) return; set_thread_flag(TIF_LAZY_MMU); } static inline void arch_flush_lazy_mmu_mode(void) { if (in_interrupt()) return; if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) emit_pte_barriers(); } static inline void arch_leave_lazy_mmu_mode(void) { if (in_interrupt()) return; arch_flush_lazy_mmu_mode(); clear_thread_flag(TIF_LAZY_MMU); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE /* Set stride and tlb_level in flush_*_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) \ __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) #define flush_pud_tlb_range(vma, addr, end) \ __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Outside of a few very special situations (e.g. hibernation), we always * use broadcast TLB invalidation instructions, therefore a spurious page * fault on one CPU which has been handled concurrently by another CPU * does not need to perform additional invalidation. */ #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0) /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) phys_to_page(__pa_symbol(empty_zero_page)) #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e)) #ifdef CONFIG_ARM64_PA_BITS_52 static inline phys_addr_t __pte_to_phys(pte_t pte) { pte_val(pte) &= ~PTE_MAYBE_SHARED; return (pte_val(pte) & PTE_ADDR_LOW) | ((pte_val(pte) & PTE_ADDR_HIGH) << PTE_ADDR_HIGH_SHIFT); } static inline pteval_t __phys_to_pte_val(phys_addr_t phys) { return (phys | (phys >> PTE_ADDR_HIGH_SHIFT)) & PHYS_TO_PTE_ADDR_MASK; } #else static inline phys_addr_t __pte_to_phys(pte_t pte) { return pte_val(pte) & PTE_ADDR_LOW; } static inline pteval_t __phys_to_pte_val(phys_addr_t phys) { return phys; } #endif #define pte_pfn(pte) (__pte_to_phys(pte) >> PAGE_SHIFT) #define pfn_pte(pfn,prot) \ __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) #define __pte_clear(mm, addr, ptep) \ __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* * The following only work if pte_present(). Undefined behaviour otherwise. */ #define pte_present(pte) (pte_valid(pte) || pte_present_invalid(pte)) #define pte_young(pte) (!!(pte_val(pte) & PTE_AF)) #define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL)) #define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE)) #define pte_rdonly(pte) (!!(pte_val(pte) & PTE_RDONLY)) #define pte_user(pte) (!!(pte_val(pte) & PTE_USER)) #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) #define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP)) #define pte_tagged(pte) ((pte_val(pte) & PTE_ATTRINDX_MASK) == \ PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define pte_cont_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \ (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) #define pmd_cont_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK; \ (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) #define pte_hw_dirty(pte) (pte_write(pte) && !pte_rdonly(pte)) #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) #define pte_valid(pte) (!!(pte_val(pte) & PTE_VALID)) #define pte_present_invalid(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID) /* * Execute-only user mappings do not have the PTE_USER bit set. All valid * kernel mappings have the PTE_UXN bit set. */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) /* * Returns true if the pte is valid and has the contiguous bit set. */ #define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte)) static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute) { u64 por; if (!system_supports_poe()) return true; por = read_sysreg_s(SYS_POR_EL0); if (write) return por_elx_allows_write(por, pkey); if (execute) return por_elx_allows_exec(por, pkey); return por_elx_allows_read(por, pkey); } /* * p??_access_permitted() is true for valid user mappings (PTE_USER * bit set, subject to the write permission check). For execute-only * mappings, like PROT_EXEC with EPAN (both PTE_USER and PTE_UXN bits * not set) must return false. PROT_NONE mappings do not have the * PTE_VALID bit set. */ #define pte_access_permitted_no_overlay(pte, write) \ (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte))) #define pte_access_permitted(pte, write) \ (pte_access_permitted_no_overlay(pte, write) && \ por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false)) #define pmd_access_permitted(pmd, write) \ (pte_access_permitted(pmd_pte(pmd), (write))) #define pud_access_permitted(pud, write) \ (pte_access_permitted(pud_pte(pud), (write))) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { pte_val(pte) &= ~pgprot_val(prot); return pte; } static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot) { pte_val(pte) |= pgprot_val(prot); return pte; } static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot) { pmd_val(pmd) &= ~pgprot_val(prot); return pmd; } static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot) { pmd_val(pmd) |= pgprot_val(prot); return pmd; } static inline pte_t pte_mkwrite_novma(pte_t pte) { pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); return pte; } static inline pte_t pte_mkclean(pte_t pte) { pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY)); pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); return pte; } static inline pte_t pte_mkdirty(pte_t pte) { pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); if (pte_write(pte)) pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); return pte; } static inline pte_t pte_wrprotect(pte_t pte) { /* * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY * clear), set the PTE_DIRTY bit. */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); return pte; } static inline pte_t pte_mkold(pte_t pte) { return clear_pte_bit(pte, __pgprot(PTE_AF)); } static inline pte_t pte_mkyoung(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_AF)); } static inline pte_t pte_mkspecial(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SPECIAL)); } static inline pte_t pte_mkcont(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_CONT)); } static inline pte_t pte_mknoncont(pte_t pte) { return clear_pte_bit(pte, __pgprot(PTE_CONT)); } static inline pte_t pte_mkvalid(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_VALID)); } static inline pte_t pte_mkinvalid(pte_t pte) { pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID)); pte = clear_pte_bit(pte, __pgprot(PTE_VALID)); return pte; } static inline pmd_t pmd_mkcont(pmd_t pmd) { return __pmd(pmd_val(pmd) | PMD_SECT_CONT); } static inline pte_t pte_mkdevmap(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { return !!(pte_val(pte) & PTE_UFFD_WP); } static inline pte_t pte_mkuffd_wp(pte_t pte) { return pte_wrprotect(set_pte_bit(pte, __pgprot(PTE_UFFD_WP))); } static inline pte_t pte_clear_uffd_wp(pte_t pte) { return clear_pte_bit(pte, __pgprot(PTE_UFFD_WP)); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); } static inline void __set_pte_complete(pte_t pte) { /* * Only if the new pte is valid and kernel, otherwise TLB maintenance * has the necessary barriers. */ if (pte_valid_not_user(pte)) queue_pte_barriers(); } static inline void __set_pte(pte_t *ptep, pte_t pte) { __set_pte_nosync(ptep, pte); __set_pte_complete(pte); } static inline pte_t __ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(pteval_t old, pteval_t new); /* * PTE bits configuration in the presence of hardware Dirty Bit Management * (PTE_WRITE == PTE_DBM): * * Dirty Writable | PTE_RDONLY PTE_WRITE PTE_DIRTY (sw) * 0 0 | 1 0 0 * 0 1 | 1 1 0 * 1 0 | 1 0 1 * 1 1 | 0 1 x * * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via * the page fault mechanism. Checking the dirty status of a pte becomes: * * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) */ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, pte_t pte) { pte_t old_pte; if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; if (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1) return; /* * Check for potential race with hardware updates of the pte * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). */ VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(old_pte), pte_val(pte)); VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(old_pte), pte_val(pte)); VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)), "%s: unsafe attribute change: 0x%016llx -> 0x%016llx", __func__, pte_val(old_pte), pte_val(pte)); } static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) { if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) __sync_icache_dcache(pte); /* * If the PTE would provide user space access to the tags associated * with it then ensure that the MTE tags are synchronised. Although * pte_access_permitted_no_overlay() returns false for exec only * mappings, they don't expose tags (instruction fetches don't check * tags). */ if (system_supports_mte() && pte_access_permitted_no_overlay(pte, false) && !pte_special(pte) && pte_tagged(pte)) mte_sync_tags(pte, nr_pages); } /* * Select all bits except the pfn */ #define pte_pgprot pte_pgprot static inline pgprot_t pte_pgprot(pte_t pte) { unsigned long pfn = pte_pfn(pte); return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } #define pte_advance_pfn pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } /* * Hugetlb definitions. */ #define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); } static inline pte_t p4d_pte(p4d_t p4d) { return __pte(p4d_val(p4d)); } static inline pte_t pud_pte(pud_t pud) { return __pte(pud_val(pud)); } static inline pud_t pte_pud(pte_t pte) { return __pud(pte_val(pte)); } static inline pmd_t pud_pmd(pud_t pud) { return __pmd(pud_val(pud)); } static inline pte_t pmd_pte(pmd_t pmd) { return __pte(pmd_val(pmd)); } static inline pmd_t pte_pmd(pte_t pte) { return __pmd(pte_val(pte)); } static inline pgprot_t mk_pud_sect_prot(pgprot_t prot) { return __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) | PUD_TYPE_SECT); } static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot) { return __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); } static inline pte_t pte_swp_mkexclusive(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & PTE_SWP_EXCLUSIVE; } static inline pte_t pte_swp_clear_exclusive(pte_t pte) { return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline pte_t pte_swp_mkuffd_wp(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); } static inline int pte_swp_uffd_wp(pte_t pte) { return !!(pte_val(pte) & PTE_SWP_UFFD_WP); } static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) { return clear_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP)); } #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h */ static inline int pte_protnone(pte_t pte) { /* * pte_present_invalid() tells us that the pte is invalid from HW * perspective but present from SW perspective, so the fields are to be * interpretted as per the HW layout. The second 2 checks are the unique * encoding that we use for PROT_NONE. It is insufficient to only use * the first check because we share the same encoding scheme with pmds * which support pmd_mkinvalid(), so can be present-invalid without * being PROT_NONE. */ return pte_present_invalid(pte) && !pte_user(pte) && !pte_user_exec(pte); } static inline int pmd_protnone(pmd_t pmd) { return pte_protnone(pmd_pte(pmd)); } #endif #define pmd_present(pmd) pte_present(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) #define pmd_valid(pmd) pte_valid(pmd_pte(pmd)) #define pmd_user(pmd) pte_user(pmd_pte(pmd)) #define pmd_user_exec(pmd) pte_user_exec(pmd_pte(pmd)) #define pmd_cont(pmd) pte_cont(pmd_pte(pmd)) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkwrite_novma(pmd) pte_pmd(pte_mkwrite_novma(pmd_pte(pmd))) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd))) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP #define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd)) #define pmd_mkuffd_wp(pmd) pte_pmd(pte_mkuffd_wp(pmd_pte(pmd))) #define pmd_clear_uffd_wp(pmd) pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd))) #define pmd_swp_uffd_wp(pmd) pte_swp_uffd_wp(pmd_pte(pmd)) #define pmd_swp_mkuffd_wp(pmd) pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd))) #define pmd_swp_clear_uffd_wp(pmd) \ pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd))) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ #define pmd_write(pmd) pte_write(pmd_pte(pmd)) static inline pmd_t pmd_mkhuge(pmd_t pmd) { /* * It's possible that the pmd is present-invalid on entry * and in that case it needs to remain present-invalid on * exit. So ensure the VALID bit does not get modified. */ pmdval_t mask = PMD_TYPE_MASK & ~PTE_VALID; pmdval_t val = PMD_TYPE_SECT & ~PTE_VALID; return __pmd((pmd_val(pmd) & ~mask) | val); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_devmap(pmd) pte_devmap(pmd_pte(pmd)) #endif static inline pmd_t pmd_mkdevmap(pmd_t pmd) { return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP))); } #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP #define pmd_special(pte) (!!((pmd_val(pte) & PTE_SPECIAL))) static inline pmd_t pmd_mkspecial(pmd_t pmd) { return set_pmd_bit(pmd, __pgprot(PTE_SPECIAL)); } #endif #define __pmd_to_phys(pmd) __pte_to_phys(pmd_pte(pmd)) #define __phys_to_pmd_val(phys) __phys_to_pte_val(phys) #define pmd_pfn(pmd) ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) static inline pud_t pud_mkhuge(pud_t pud) { /* * It's possible that the pud is present-invalid on entry * and in that case it needs to remain present-invalid on * exit. So ensure the VALID bit does not get modified. */ pudval_t mask = PUD_TYPE_MASK & ~PTE_VALID; pudval_t val = PUD_TYPE_SECT & ~PTE_VALID; return __pud((pud_val(pud) & ~mask) | val); } #define __pud_to_phys(pud) __pte_to_phys(pud_pte(pud)) #define __phys_to_pud_val(phys) __phys_to_pte_val(phys) #define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT) #define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pmd_pgprot pmd_pgprot static inline pgprot_t pmd_pgprot(pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); } #define pud_pgprot pud_pgprot static inline pgprot_t pud_pgprot(pud_t pud) { unsigned long pfn = pud_pfn(pud); return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); } static inline void __set_ptes_anysz(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr, unsigned long pgsize) { unsigned long stride = pgsize >> PAGE_SHIFT; switch (pgsize) { case PAGE_SIZE: page_table_check_ptes_set(mm, ptep, pte, nr); break; case PMD_SIZE: page_table_check_pmds_set(mm, (pmd_t *)ptep, pte_pmd(pte), nr); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: page_table_check_puds_set(mm, (pud_t *)ptep, pte_pud(pte), nr); break; #endif default: VM_WARN_ON(1); } __sync_cache_and_tags(pte, nr * stride); for (;;) { __check_safe_pte_update(mm, ptep, pte); __set_pte_nosync(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_advance_pfn(pte, stride); } __set_pte_complete(pte); } static inline void __set_ptes(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) { __set_ptes_anysz(mm, ptep, pte, nr, PAGE_SIZE); } static inline void __set_pmds(struct mm_struct *mm, unsigned long __always_unused addr, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { __set_ptes_anysz(mm, (pte_t *)pmdp, pmd_pte(pmd), nr, PMD_SIZE); } #define set_pmd_at(mm, addr, pmdp, pmd) __set_pmds(mm, addr, pmdp, pmd, 1) static inline void __set_puds(struct mm_struct *mm, unsigned long __always_unused addr, pud_t *pudp, pud_t pud, unsigned int nr) { __set_ptes_anysz(mm, (pte_t *)pudp, pud_pte(pud), nr, PUD_SIZE); } #define set_pud_at(mm, addr, pudp, pud) __set_puds(mm, addr, pudp, pud, 1) #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) #define __pgd_to_phys(pgd) __pte_to_phys(pgd_pte(pgd)) #define __phys_to_pgd_val(phys) __phys_to_pte_val(phys) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) #define pgprot_nx(prot) \ __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN) #define pgprot_decrypted(prot) \ __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED) #define pgprot_encrypted(prot) \ __pgprot_modify(prot, PROT_NS_SHARED, 0) /* * Mark the prot value as uncacheable and unbufferable. */ #define pgprot_noncached(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN) #define pgprot_writecombine(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) #define pgprot_device(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN) #define pgprot_tagged(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED)) #define pgprot_mhp pgprot_tagged /* * DMA allocations for non-coherent devices use what the Arm architecture calls * "Normal non-cacheable" memory, which permits speculation, unaligned accesses * and merging of writes. This is different from "Device-nGnR[nE]" memory which * is intended for MMIO and thus forbids speculation, preserves access size, * requires strict alignment and can also force write responses to come from the * endpoint. */ #define pgprot_dmacoherent(prot) \ __pgprot_modify(prot, PTE_ATTRINDX_MASK, \ PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN) #define __HAVE_PHYS_MEM_ACCESS_PROT struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_TABLE) #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) #define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd)) #define pmd_bad(pmd) (!pmd_table(pmd)) #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE) #define pte_leaf_size(pte) (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { /* * If pmd is present-invalid, pmd_table() won't detect it * as a table, so force the valid bit for the comparison. */ return pmd_present(pmd) && !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3 static inline bool pud_sect(pud_t pud) { return false; } static inline bool pud_table(pud_t pud) { return true; } #else #define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ PUD_TYPE_SECT) #define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ PUD_TYPE_TABLE) #endif extern pgd_t swapper_pg_dir[]; extern pgd_t idmap_pg_dir[]; extern pgd_t tramp_pg_dir[]; extern pgd_t reserved_pg_dir[]; extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); static inline bool in_swapper_pgdir(void *addr) { return ((unsigned long)addr & PAGE_MASK) == ((unsigned long)swapper_pg_dir & PAGE_MASK); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { #ifdef __PAGETABLE_PMD_FOLDED if (in_swapper_pgdir(pmdp)) { set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd))); return; } #endif /* __PAGETABLE_PMD_FOLDED */ WRITE_ONCE(*pmdp, pmd); if (pmd_valid(pmd)) queue_pte_barriers(); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } static inline phys_addr_t pmd_page_paddr(pmd_t pmd) { return __pmd_to_phys(pmd); } static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_page_paddr(pmd)); } /* Find an entry in the third-level page table. */ #define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) #define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) #define pte_clear_fixmap() clear_fixmap(FIX_PTE) #define pmd_page(pmd) phys_to_page(__pmd_to_phys(pmd)) /* use ONLY for statically allocated translation tables */ #define pte_offset_kimg(dir,addr) ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr)))) #if CONFIG_PGTABLE_LEVELS > 2 #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) #define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \ PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #ifndef __PAGETABLE_PMD_FOLDED #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #else #define pud_leaf(pud) false #endif #define pud_valid(pud) pte_valid(pud_pte(pud)) #define pud_user(pud) pte_user(pud_pte(pud)) #define pud_user_exec(pud) pte_user_exec(pud_pte(pud)) static inline bool pgtable_l4_enabled(void); static inline void set_pud(pud_t *pudp, pud_t pud) { if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) { set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud))); return; } WRITE_ONCE(*pudp, pud); if (pud_valid(pud)) queue_pte_barriers(); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, __pud(0)); } static inline phys_addr_t pud_page_paddr(pud_t pud) { return __pud_to_phys(pud); } static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)__va(pud_page_paddr(pud)); } /* Find an entry in the second-level page table. */ #define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) #define pmd_set_fixmap_offset(pud, addr) pmd_set_fixmap(pmd_offset_phys(pud, addr)) #define pmd_clear_fixmap() clear_fixmap(FIX_PMD) #define pud_page(pud) phys_to_page(__pud_to_phys(pud)) /* use ONLY for statically allocated translation tables */ #define pmd_offset_kimg(dir,addr) ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr)))) #else #define pud_valid(pud) false #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) #define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */ /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */ #define pmd_set_fixmap(addr) NULL #define pmd_set_fixmap_offset(pudp, addr) ((pmd_t *)pudp) #define pmd_clear_fixmap() #define pmd_offset_kimg(dir,addr) ((pmd_t *)dir) #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #if CONFIG_PGTABLE_LEVELS > 3 static __always_inline bool pgtable_l4_enabled(void) { if (CONFIG_PGTABLE_LEVELS > 4 || !IS_ENABLED(CONFIG_ARM64_LPA2)) return true; if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT)) return vabits_actual == VA_BITS; return alternative_has_cap_unlikely(ARM64_HAS_VA52); } static inline bool mm_pud_folded(const struct mm_struct *mm) { return !pgtable_l4_enabled(); } #define mm_pud_folded mm_pud_folded #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e)) #define p4d_none(p4d) (pgtable_l4_enabled() && !p4d_val(p4d)) #define p4d_bad(p4d) (pgtable_l4_enabled() && \ ((p4d_val(p4d) & P4D_TYPE_MASK) != \ P4D_TYPE_TABLE)) #define p4d_present(p4d) (!p4d_none(p4d)) static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { if (in_swapper_pgdir(p4dp)) { set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d))); return; } WRITE_ONCE(*p4dp, p4d); queue_pte_barriers(); } static inline void p4d_clear(p4d_t *p4dp) { if (pgtable_l4_enabled()) set_p4d(p4dp, __p4d(0)); } static inline phys_addr_t p4d_page_paddr(p4d_t p4d) { return __p4d_to_phys(p4d); } #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr) { /* Ensure that 'p4dp' indexes a page table according to 'addr' */ VM_BUG_ON(((addr >> P4D_SHIFT) ^ ((u64)p4dp >> 3)) % PTRS_PER_P4D); return (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE) + pud_index(addr); } static inline pud_t *p4d_pgtable(p4d_t p4d) { return (pud_t *)__va(p4d_page_paddr(p4d)); } static inline phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr) { BUG_ON(!pgtable_l4_enabled()); return p4d_page_paddr(READ_ONCE(*p4dp)) + pud_index(addr) * sizeof(pud_t); } static inline pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long addr) { if (!pgtable_l4_enabled()) return p4d_to_folded_pud(p4dp, addr); return (pud_t *)__va(p4d_page_paddr(p4d)) + pud_index(addr); } #define pud_offset_lockless pud_offset_lockless static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long addr) { return pud_offset_lockless(p4dp, READ_ONCE(*p4dp), addr); } #define pud_offset pud_offset static inline pud_t *pud_set_fixmap(unsigned long addr) { if (!pgtable_l4_enabled()) return NULL; return (pud_t *)set_fixmap_offset(FIX_PUD, addr); } static inline pud_t *pud_set_fixmap_offset(p4d_t *p4dp, unsigned long addr) { if (!pgtable_l4_enabled()) return p4d_to_folded_pud(p4dp, addr); return pud_set_fixmap(pud_offset_phys(p4dp, addr)); } static inline void pud_clear_fixmap(void) { if (pgtable_l4_enabled()) clear_fixmap(FIX_PUD); } /* use ONLY for statically allocated translation tables */ static inline pud_t *pud_offset_kimg(p4d_t *p4dp, u64 addr) { if (!pgtable_l4_enabled()) return p4d_to_folded_pud(p4dp, addr); return (pud_t *)__phys_to_kimg(pud_offset_phys(p4dp, addr)); } #define p4d_page(p4d) pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d))) #else static inline bool pgtable_l4_enabled(void) { return false; } #define p4d_page_paddr(p4d) ({ BUILD_BUG(); 0;}) /* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */ #define pud_set_fixmap(addr) NULL #define pud_set_fixmap_offset(pgdp, addr) ((pud_t *)pgdp) #define pud_clear_fixmap() #define pud_offset_kimg(dir,addr) ((pud_t *)dir) #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #if CONFIG_PGTABLE_LEVELS > 4 static __always_inline bool pgtable_l5_enabled(void) { if (!alternative_has_cap_likely(ARM64_ALWAYS_BOOT)) return vabits_actual == VA_BITS; return alternative_has_cap_unlikely(ARM64_HAS_VA52); } static inline bool mm_p4d_folded(const struct mm_struct *mm) { return !pgtable_l5_enabled(); } #define mm_p4d_folded mm_p4d_folded #define p4d_ERROR(e) \ pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e)) #define pgd_none(pgd) (pgtable_l5_enabled() && !pgd_val(pgd)) #define pgd_bad(pgd) (pgtable_l5_enabled() && \ ((pgd_val(pgd) & PGD_TYPE_MASK) != \ PGD_TYPE_TABLE)) #define pgd_present(pgd) (!pgd_none(pgd)) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { if (in_swapper_pgdir(pgdp)) { set_swapper_pgd(pgdp, __pgd(pgd_val(pgd))); return; } WRITE_ONCE(*pgdp, pgd); queue_pte_barriers(); } static inline void pgd_clear(pgd_t *pgdp) { if (pgtable_l5_enabled()) set_pgd(pgdp, __pgd(0)); } static inline phys_addr_t pgd_page_paddr(pgd_t pgd) { return __pgd_to_phys(pgd); } #define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr) { /* Ensure that 'pgdp' indexes a page table according to 'addr' */ VM_BUG_ON(((addr >> PGDIR_SHIFT) ^ ((u64)pgdp >> 3)) % PTRS_PER_PGD); return (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE) + p4d_index(addr); } static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr) { BUG_ON(!pgtable_l5_enabled()); return pgd_page_paddr(READ_ONCE(*pgdp)) + p4d_index(addr) * sizeof(p4d_t); } static inline p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long addr) { if (!pgtable_l5_enabled()) return pgd_to_folded_p4d(pgdp, addr); return (p4d_t *)__va(pgd_page_paddr(pgd)) + p4d_index(addr); } #define p4d_offset_lockless p4d_offset_lockless static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr) { return p4d_offset_lockless(pgdp, READ_ONCE(*pgdp), addr); } static inline p4d_t *p4d_set_fixmap(unsigned long addr) { if (!pgtable_l5_enabled()) return NULL; return (p4d_t *)set_fixmap_offset(FIX_P4D, addr); } static inline p4d_t *p4d_set_fixmap_offset(pgd_t *pgdp, unsigned long addr) { if (!pgtable_l5_enabled()) return pgd_to_folded_p4d(pgdp, addr); return p4d_set_fixmap(p4d_offset_phys(pgdp, addr)); } static inline void p4d_clear_fixmap(void) { if (pgtable_l5_enabled()) clear_fixmap(FIX_P4D); } /* use ONLY for statically allocated translation tables */ static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr) { if (!pgtable_l5_enabled()) return pgd_to_folded_p4d(pgdp, addr); return (p4d_t *)__phys_to_kimg(p4d_offset_phys(pgdp, addr)); } #define pgd_page(pgd) pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd))) #else static inline bool pgtable_l5_enabled(void) { return false; } #define p4d_index(addr) (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1)) /* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */ #define p4d_set_fixmap(addr) NULL #define p4d_set_fixmap_offset(p4dp, addr) ((p4d_t *)p4dp) #define p4d_clear_fixmap() #define p4d_offset_kimg(dir,addr) ((p4d_t *)dir) static inline p4d_t *p4d_offset_lockless_folded(pgd_t *pgdp, pgd_t pgd, unsigned long addr) { /* * With runtime folding of the pud, pud_offset_lockless() passes * the 'pgd_t *' we return here to p4d_to_folded_pud(), which * will offset the pointer assuming that it points into * a page-table page. However, the fast GUP path passes us a * pgd_t allocated on the stack and so we must use the original * pointer in 'pgdp' to construct the p4d pointer instead of * using the generic p4d_offset_lockless() implementation. * * Note: reusing the original pointer means that we may * dereference the same (live) page-table entry multiple times. * This is safe because it is still only loaded once in the * context of each level and the CPU guarantees same-address * read-after-read ordering. */ return p4d_offset(pgdp, addr); } #define p4d_offset_lockless p4d_offset_lockless_folded #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e)) #define pgd_set_fixmap(addr) ((pgd_t *)set_fixmap_offset(FIX_PGD, addr)) #define pgd_clear_fixmap() clear_fixmap(FIX_PGD) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { /* * Normal and Normal-Tagged are two different memory types and indices * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK. */ const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY | PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE | PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) pte = set_pte_bit(pte, __pgprot(PTE_DIRTY)); pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); /* * If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware * dirtiness again. */ if (pte_sw_dirty(pte)) pte = pte_mkdirty(pte); return pte; } static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #ifdef CONFIG_PAGE_TABLE_CHECK static inline bool pte_user_accessible_page(pte_t pte) { return pte_valid(pte) && (pte_user(pte) || pte_user_exec(pte)); } static inline bool pmd_user_accessible_page(pmd_t pmd) { return pmd_valid(pmd) && !pmd_table(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); } static inline bool pud_user_accessible_page(pud_t pud) { return pud_valid(pud) && !pud_table(pud) && (pud_user(pud) || pud_user_exec(pud)); } #endif /* * Atomic pte/pmd modifications. */ static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t old_pte, pte; pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), pte_val(old_pte), pte_val(pte)); } while (pte_val(pte) != pte_val(old_pte)); return pte_young(pte); } static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* * We can elide the trailing DSB here since the worst that can * happen is that a CPU continues to use the young entry in its * TLB and we mistakenly reclaim the associated page. The * window for such an event is bounded by the next * context-switch, which provides a DSB to complete the TLB * invalidation. */ flush_tlb_page_nosync(vma, address); } return young; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */ VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft()); return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, pte_t *ptep, unsigned long pgsize) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); switch (pgsize) { case PAGE_SIZE: page_table_check_pte_clear(mm, pte); break; case PMD_SIZE: page_table_check_pmd_clear(mm, pte_pmd(pte)); break; #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: page_table_check_pud_clear(mm, pte_pud(pte)); break; #endif default: VM_WARN_ON(1); } return pte; } static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { return __ptep_get_and_clear_anysz(mm, ptep, PAGE_SIZE); } static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { for (;;) { __ptep_get_and_clear(mm, addr, ptep); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte, tmp_pte; pte = __ptep_get_and_clear(mm, addr, ptep); while (--nr) { ptep++; addr += PAGE_SIZE; tmp_pte = __ptep_get_and_clear(mm, addr, ptep); if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } return pte; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { return pte_pmd(__ptep_get_and_clear_anysz(mm, (pte_t *)pmdp, PMD_SIZE)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void ___ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep, pte_t pte) { pte_t old_pte; do { old_pte = pte; pte = pte_wrprotect(pte); pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), pte_val(old_pte), pte_val(pte)); } while (pte_val(pte) != pte_val(old_pte)); } /* * __ptep_set_wrprotect - mark read-only while transferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ static inline void __ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); } static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, pte_t *ptep, unsigned int nr) { unsigned int i; for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) __ptep_set_wrprotect(mm, address, ptep); } static inline void __clear_young_dirty_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, cydp_t flags) { pte_t old_pte; do { old_pte = pte; if (flags & CYDP_CLEAR_YOUNG) pte = pte_mkold(pte); if (flags & CYDP_CLEAR_DIRTY) pte = pte_mkclean(pte); pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), pte_val(old_pte), pte_val(pte)); } while (pte_val(pte) != pte_val(old_pte)); } static inline void __clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { pte_t pte; for (;;) { pte = __ptep_get(ptep); if (flags == (CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY)) __set_pte(ptep, pte_mkclean(pte_mkold(pte))); else __clear_young_dirty_pte(vma, addr, ptep, pte, flags); if (--nr == 0) break; ptep++; addr += PAGE_SIZE; } } #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif /* * Encode and decode a swap entry: * bits 0-1: present (must be zero) * bits 2: remember PG_anon_exclusive * bit 3: remember uffd-wp state * bits 6-10: swap type * bit 11: PTE_PRESENT_INVALID (must be zero) * bits 12-61: swap offset */ #define __SWP_TYPE_SHIFT 6 #define __SWP_TYPE_BITS 5 #define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1) #define __SWP_OFFSET_SHIFT 12 #define __SWP_OFFSET_BITS 50 #define __SWP_OFFSET_MASK ((1UL << __SWP_OFFSET_BITS) - 1) #define __swp_type(x) (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK) #define __swp_offset(x) (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK) #define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(swp) __pmd((swp).val) #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ /* * Ensure that there are not more swap files than can be encoded in the kernel * PTEs. */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) #ifdef CONFIG_ARM64_MTE #define __HAVE_ARCH_PREPARE_TO_SWAP extern int arch_prepare_to_swap(struct folio *folio); #define __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { if (system_supports_mte()) mte_invalidate_tags(type, offset); } static inline void arch_swap_invalidate_area(int type) { if (system_supports_mte()) mte_invalidate_tags_area(type); } #define __HAVE_ARCH_SWAP_RESTORE extern void arch_swap_restore(swp_entry_t entry, struct folio *folio); #endif /* CONFIG_ARM64_MTE */ /* * On AArch64, the cache coherency is handled via the __set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr) { /* * We don't do anything here, so there's a very small chance of * us retaking a user fault which we just fixed up. The alternative * is doing a dsb(ishst), but that penalises the fastpath. */ } #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #ifdef CONFIG_ARM64_PA_BITS_52 #define phys_to_ttbr(addr) (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52) #else #define phys_to_ttbr(addr) (addr) #endif /* * On arm64 without hardware Access Flag, copying from user will fail because * the pte is old and cannot be marked young. So we always end up with zeroed * page after fork() + CoW for pfn mappings. We don't always have a * hardware-managed access flag on arm64. */ #define arch_has_hw_pte_young cpu_has_hw_af #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG #define arch_has_hw_nonleaf_pmd_young system_supports_haft #endif /* * Experimentally, it's cheap to set the access flag in hardware and we * benefit from prefaulting mappings as 'old' to start with. */ #define arch_wants_old_prefaulted_pte cpu_has_hw_af static inline bool pud_sect_supported(void) { return PAGE_SIZE == SZ_4K; } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION #define ptep_modify_prot_start ptep_modify_prot_start extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); #define ptep_modify_prot_commit ptep_modify_prot_commit extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); #ifdef CONFIG_ARM64_CONTPTE /* * The contpte APIs are used to transparently manage the contiguous bit in ptes * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below). */ extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full); extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full); extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); extern void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags); static __always_inline void contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { /* * Only bother trying if both the virtual and physical addresses are * aligned and correspond to the last entry in a contig range. The core * code mostly modifies ranges from low to high, so this is the likely * the last modification in the contig range, so a good time to fold. * We can't fold special mappings, because there is no associated folio. */ const unsigned long contmask = CONT_PTES - 1; bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; if (unlikely(valign)) { bool palign = (pte_pfn(pte) & contmask) == contmask; if (unlikely(palign && pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) __contpte_try_fold(mm, addr, ptep, pte); } } static __always_inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { if (unlikely(pte_valid_cont(pte))) __contpte_try_unfold(mm, addr, ptep, pte); } #define pte_batch_hint pte_batch_hint static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) { if (!pte_valid_cont(pte)) return 1; return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); } /* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this * is the subset of the API that arm64 needs to implement). These public * versions will automatically and transparently apply the contiguous bit where * it makes sense to do so. Therefore any users that are contig-aware (e.g. * hugetlb, kernel mapper) should NOT use these APIs, but instead use the * private versions, which are prefixed with double underscore. All of these * APIs except for ptep_get_lockless() are expected to be called with the PTL * held. Although the contiguous bit is considered private to the * implementation, it is deliberately allowed to leak through the getters (e.g. * ptep_get()), back to core code. This is required so that pte_leaf_size() can * provide an accurate size for perf_get_pgtable_size(). But this leakage means * its possible a pte will be passed to a setter with the contiguous bit set, so * we explicitly clear the contiguous bit in those cases to prevent accidentally * setting it in the pgtable. */ #define ptep_get ptep_get static inline pte_t ptep_get(pte_t *ptep) { pte_t pte = __ptep_get(ptep); if (likely(!pte_valid_cont(pte))) return pte; return contpte_ptep_get(ptep, pte); } #define ptep_get_lockless ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte = __ptep_get(ptep); if (likely(!pte_valid_cont(pte))) return pte; return contpte_ptep_get_lockless(ptep); } static inline void set_pte(pte_t *ptep, pte_t pte) { /* * We don't have the mm or vaddr so cannot unfold contig entries (since * it requires tlb maintenance). set_pte() is not used in core code, so * this should never even be called. Regardless do our best to service * any call and emit a warning if there is any attempt to set a pte on * top of an existing contig range. */ pte_t orig_pte = __ptep_get(ptep); WARN_ON_ONCE(pte_valid_cont(orig_pte)); __set_pte(ptep, pte_mknoncont(pte)); } #define set_ptes set_ptes static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { pte = pte_mknoncont(pte); if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __set_ptes(mm, addr, ptep, pte, 1); contpte_try_fold(mm, addr, ptep, pte); } else { contpte_set_ptes(mm, addr, ptep, pte, nr); } } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __pte_clear(mm, addr, ptep); } #define clear_full_ptes clear_full_ptes static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __clear_full_ptes(mm, addr, ptep, nr, full); } else { contpte_clear_full_ptes(mm, addr, ptep, nr, full); } } #define get_and_clear_full_ptes get_and_clear_full_ptes static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { pte_t pte; if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } else { pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); } return pte; } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); return __ptep_get_and_clear(mm, addr, ptep); } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t orig_pte = __ptep_get(ptep); if (likely(!pte_valid_cont(orig_pte))) return __ptep_test_and_clear_young(vma, addr, ptep); return contpte_ptep_test_and_clear_young(vma, addr, ptep); } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH static inline int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t orig_pte = __ptep_get(ptep); if (likely(!pte_valid_cont(orig_pte))) return __ptep_clear_flush_young(vma, addr, ptep); return contpte_ptep_clear_flush_young(vma, addr, ptep); } #define wrprotect_ptes wrprotect_ptes static __always_inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1)) { /* * Optimization: wrprotect_ptes() can only be called for present * ptes so we only need to check contig bit as condition for * unfold, and we can remove the contig bit from the pte we read * to avoid re-reading. This speeds up fork() which is sensitive * for order-0 folios. Equivalent to contpte_try_unfold(). */ pte_t orig_pte = __ptep_get(ptep); if (unlikely(pte_cont(orig_pte))) { __contpte_try_unfold(mm, addr, ptep, orig_pte); orig_pte = pte_mknoncont(orig_pte); } ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); } else { contpte_wrprotect_ptes(mm, addr, ptep, nr); } } #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { wrprotect_ptes(mm, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS static inline int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) { pte_t orig_pte = __ptep_get(ptep); entry = pte_mknoncont(entry); if (likely(!pte_valid_cont(orig_pte))) return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); } #define clear_young_dirty_ptes clear_young_dirty_ptes static inline void clear_young_dirty_ptes(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, unsigned int nr, cydp_t flags) { if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) __clear_young_dirty_ptes(vma, addr, ptep, nr, flags); else contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags); } #else /* CONFIG_ARM64_CONTPTE */ #define ptep_get __ptep_get #define set_pte __set_pte #define set_ptes __set_ptes #define pte_clear __pte_clear #define clear_full_ptes __clear_full_ptes #define get_and_clear_full_ptes __get_and_clear_full_ptes #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define ptep_get_and_clear __ptep_get_and_clear #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG #define ptep_test_and_clear_young __ptep_test_and_clear_young #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young __ptep_clear_flush_young #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define ptep_set_wrprotect __ptep_set_wrprotect #define wrprotect_ptes __wrprotect_ptes #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags #define clear_young_dirty_ptes __clear_young_dirty_ptes #endif /* CONFIG_ARM64_CONTPTE */ #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */
5 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FILEATTR_H #define _LINUX_FILEATTR_H /* Flags shared betwen flags/xflags */ #define FS_COMMON_FL \ (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \ FS_NODUMP_FL | FS_NOATIME_FL | FS_DAX_FL | \ FS_PROJINHERIT_FL) #define FS_XFLAG_COMMON \ (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \ FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \ FS_XFLAG_PROJINHERIT) /* * Merged interface for miscellaneous file attributes. 'flags' originates from * ext* and 'fsx_flags' from xfs. There's some overlap between the two, which * is handled by the VFS helpers, so filesystems are free to implement just one * or both of these sub-interfaces. */ struct fileattr { u32 flags; /* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */ /* struct fsxattr: */ u32 fsx_xflags; /* xflags field value (get/set) */ u32 fsx_extsize; /* extsize field value (get/set)*/ u32 fsx_nextents; /* nextents field value (get) */ u32 fsx_projid; /* project identifier (get/set) */ u32 fsx_cowextsize; /* CoW extsize field value (get/set)*/ /* selectors: */ bool flags_valid:1; bool fsx_valid:1; }; int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa); void fileattr_fill_xflags(struct fileattr *fa, u32 xflags); void fileattr_fill_flags(struct fileattr *fa, u32 flags); /** * fileattr_has_fsx - check for extended flags/attributes * @fa: fileattr pointer * * Return: true if any attributes are present that are not represented in * ->flags. */ static inline bool fileattr_has_fsx(const struct fileattr *fa) { return fa->fsx_valid && ((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 || fa->fsx_projid != 0 || fa->fsx_cowextsize != 0); } int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); #endif /* _LINUX_FILEATTR_H */
118 119 118 118 1140 1138 1140 1143 1140 103 1139 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 // SPDX-License-Identifier: GPL-2.0 #include <linux/irq_work.h> #include <linux/spinlock.h> #include <linux/task_work.h> #include <linux/resume_user_mode.h> static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); #endif /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback * @work: the callback to run * @notify: how to notify the targeted task * * Queue @work for task_work_run() below and notify the @task if @notify * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT. * * @TWA_SIGNAL works like signals, in that the it will interrupt the targeted * task and run the task_work, regardless of whether the task is currently * running in the kernel or userspace. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a * reschedule IPI to force the targeted task to reschedule and run task_work. * This can be advantageous if there's no strict requirement that the * task_work be run as soon as possible, just whenever the task enters the * kernel anyway. * @TWA_RESUME work is run only when the task exits the kernel and returns to * user mode, or before entering guest mode. * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the * current @task and if the current context is NMI. * * Fails if the @task is exiting/exited and thus it can't process this @work. * Otherwise @work->func() will be called when the @task goes through one of * the aforementioned transitions, or exits. * * If the targeted task is exiting, then an error is returned and the work item * is not queued. It's up to the caller to arrange for an alternative mechanism * in that case. * * Note: there is no ordering guarantee on works queued here. The task_work * list is LIFO. * * RETURNS: * 0 if succeeds or -ESRCH. */ int task_work_add(struct task_struct *task, struct callback_head *work, enum task_work_notify_mode notify) { struct callback_head *head; if (notify == TWA_NMI_CURRENT) { if (WARN_ON_ONCE(task != current)) return -EINVAL; if (!IS_ENABLED(CONFIG_IRQ_WORK)) return -EINVAL; } else { kasan_record_aux_stack(work); } head = READ_ONCE(task->task_works); do { if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: break; case TWA_RESUME: set_notify_resume(task); break; case TWA_SIGNAL: set_notify_signal(task); break; case TWA_SIGNAL_NO_IPI: __set_notify_signal(task); break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif default: WARN_ON_ONCE(1); break; } return 0; } /** * task_work_cancel_match - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @match: match function to call * @data: data to be passed in to match function * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_match(struct task_struct *task, bool (*match)(struct callback_head *, void *data), void *data) { struct callback_head **pprev = &task->task_works; struct callback_head *work; unsigned long flags; if (likely(!task_work_pending(task))) return NULL; /* * If cmpxchg() fails we continue without updating pprev. * Either we raced with task_work_add() which added the * new entry before this work, we will find it again. Or * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); work = READ_ONCE(*pprev); while (work) { if (!match(work, data)) { pprev = &work->next; work = READ_ONCE(*pprev); } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); return work; } static bool task_work_func_match(struct callback_head *cb, void *data) { return cb->func == data; } /** * task_work_cancel_func - cancel a pending work matching a function added by task_work_add() * @task: the task which should execute the func's work * @func: identifies the func to match with a work to remove * * Find the last queued pending work with ->func == @func and remove * it from queue. * * RETURNS: * The found work or NULL if not found. */ struct callback_head * task_work_cancel_func(struct task_struct *task, task_work_func_t func) { return task_work_cancel_match(task, task_work_func_match, func); } static bool task_work_match(struct callback_head *cb, void *data) { return cb == data; } /** * task_work_cancel - cancel a pending work added by task_work_add() * @task: the task which should execute the work * @cb: the callback to remove if queued * * Remove a callback from a task's queue if queued. * * RETURNS: * True if the callback was queued and got cancelled, false otherwise. */ bool task_work_cancel(struct task_struct *task, struct callback_head *cb) { struct callback_head *ret; ret = task_work_cancel_match(task, task_work_match, cb); return ret == cb; } /** * task_work_run - execute the works added by task_work_add() * * Flush the pending works. Should be used by the core kernel code. * Called before the task returns to the user-mode or stops, or when * it exits. In the latter case task_work_add() can no longer add the * new work after task_work_run() returns. */ void task_work_run(void) { struct task_struct *task = current; struct callback_head *work, *head, *next; for (;;) { /* * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ work = READ_ONCE(task->task_works); do { head = NULL; if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; /* * Synchronize with task_work_cancel_match(). It can not remove * the first entry == work, cmpxchg(task_works) must fail. * But it can remove another entry from the ->next list. */ raw_spin_lock_irq(&task->pi_lock); raw_spin_unlock_irq(&task->pi_lock); do { next = work->next; work->func(work); work = next; cond_resched(); } while (work); } }
33 33 33 33 33 35 35 2 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 5 33 33 33 33 33 33 33 33 33 32 33 33 33 2 37 35 2 4 2 4 4 33 33 33 33 33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson * * 2003-06-02 Jim Houston - Concurrent Computer Corp. * Changes to use preallocated sigqueue structures * to allow signals to be sent reliably. */ #include <linux/slab.h> #include <linux/export.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/uprobes.h> #include <linux/compat.h> #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/cgroup.h> #include <linux/audit.h> #include <linux/sysctl.h> #include <uapi/linux/pidfd.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> #include <asm/param.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> #include <asm/syscall.h> /* for syscall_get_* */ #include "time/posix-timers.h" /* * SLAB caches for signal bits. */ static struct kmem_cache *sigqueue_cachep; int print_fatal_signals __read_mostly; static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; } static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); /* SIGKILL and SIGSTOP may not be sent to the global init */ if (unlikely(is_global_init(t) && sig_kernel_only(sig))) return true; if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) return true; /* Only allow kernel generated signals to this kthread */ if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; return sig_handler_ignored(handler, sig); } static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the * signal handler may change by the time it is * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return false; /* * Tracers may want to know about even ignored signal unless it * is SIGKILL which can't be reported anyway but can be ignored * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) return false; return sig_task_ignored(t, sig, force); } /* * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; switch (_NSIG_WORDS) { default: for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) ready |= signal->sig[i] &~ blocked->sig[i]; break; case 4: ready = signal->sig[3] &~ blocked->sig[3]; ready |= signal->sig[2] &~ blocked->sig[2]; ready |= signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 2: ready = signal->sig[1] &~ blocked->sig[1]; ready |= signal->sig[0] &~ blocked->sig[0]; break; case 1: ready = signal->sig[0] &~ blocked->sig[0]; } return ready != 0; } #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked) || cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ return false; } void recalc_sigpending(void) { if (!recalc_sigpending_tsk(current) && !freezing(current)) { if (unlikely(test_thread_flag(TIF_SIGPENDING))) clear_thread_flag(TIF_SIGPENDING); } } EXPORT_SYMBOL(recalc_sigpending); void calculate_sigpending(void) { /* Have any signals or users of TIF_SIGPENDING been delayed * until after fork? */ spin_lock_irq(&current->sighand->siglock); set_tsk_thread_flag(current, TIF_SIGPENDING); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); } /* Given the mask, find the first available signal that should be serviced. */ #define SYNCHRONOUS_MASK \ (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ sigmask(SIGTRAP) | sigmask(SIGFPE) | sigmask(SIGSYS)) int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; int sig = 0; s = pending->signal.sig; m = mask->sig; /* * Handle the first word specially: it contains the * synchronous signals that need to be dequeued first. */ x = *s &~ *m; if (x) { if (x & SYNCHRONOUS_MASK) x &= SYNCHRONOUS_MASK; sig = ffz(~x) + 1; return sig; } switch (_NSIG_WORDS) { default: for (i = 1; i < _NSIG_WORDS; ++i) { x = *++s &~ *++m; if (!x) continue; sig = ffz(~x) + i*_NSIG_BPW + 1; break; } break; case 2: x = s[1] &~ m[1]; if (!x) break; sig = ffz(~x) + _NSIG_BPW + 1; break; case 1: /* Nothing to do */ break; } return sig; } static inline void print_dropped_signal(int sig) { static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); if (!print_fatal_signals) return; if (!__ratelimit(&ratelimit_state)) return; pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", current->comm, current->pid, sig); } /** * task_set_jobctl_pending - set jobctl pending bits * @task: target task * @mask: pending bits to set * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is * cleared. If @task is already being killed or exiting, this function * becomes noop. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if @mask is set, %false if made noop because @task was dying. */ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) task->jobctl &= ~JOBCTL_STOP_SIGMASK; task->jobctl |= mask; return true; } /** * task_clear_jobctl_trapping - clear jobctl trapping bit * @task: target task * * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. * Clear it and wake up the ptracer. Note that we don't need any further * locking. @task->siglock guarantees that @task->parent points to the * ptracer. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_trapping(struct task_struct *task) { if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { task->jobctl &= ~JOBCTL_TRAPPING; smp_mb(); /* advised by wake_up_bit() */ wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); } } /** * task_clear_jobctl_pending - clear jobctl pending bits * @task: target task * @mask: pending bits to clear * * Clear @mask from @task->jobctl. @mask must be subset of * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other * STOP bits are cleared together. * * If clearing of @mask leaves no stop or trap pending, this function calls * task_clear_jobctl_trapping(). * * CONTEXT: * Must be called with @task->sighand->siglock held. */ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask) { BUG_ON(mask & ~JOBCTL_PENDING_MASK); if (mask & JOBCTL_STOP_PENDING) mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; task->jobctl &= ~mask; if (!(task->jobctl & JOBCTL_PENDING_MASK)) task_clear_jobctl_trapping(task); } /** * task_participate_group_stop - participate in a group stop * @task: task participating in a group stop * * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. * Group stop states are cleared and the group stop count is consumed if * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group * stop, the appropriate `SIGNAL_*` flags are set. * * CONTEXT: * Must be called with @task->sighand->siglock held. * * RETURNS: * %true if group stop completion should be notified to the parent, %false * otherwise. */ static bool task_participate_group_stop(struct task_struct *task) { struct signal_struct *sig = task->signal; bool consume = task->jobctl & JOBCTL_STOP_CONSUME; WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); if (!consume) return false; if (!WARN_ON_ONCE(sig->group_stop_count == 0)) sig->group_stop_count--; /* * Tell the caller to notify completion iff we are entering into a * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; } void task_join_group_stop(struct task_struct *task) { unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; struct signal_struct *sig = current->signal; if (sig->group_stop_count) { sig->group_stop_count++; mask |= JOBCTL_STOP_CONSUME; } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) return; /* Have the new thread join an on-going signal group stop */ task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig, int override_rlimit) { struct ucounts *ucounts; long sigpending; /* * Protect access to @t credentials. This can go away when all * callers hold rcu read lock. * * NOTE! A pending signal will hold on to the user refcount, * and we get/put the refcount only when the sigpending count * changes from/to zero. */ rcu_read_lock(); ucounts = task_ucounts(t); sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; if (unlikely(!override_rlimit && sigpending > task_rlimit(t, RLIMIT_SIGPENDING))) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); print_dropped_signal(sig); return NULL; } return ucounts; } static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts, const unsigned int sigqueue_flags) { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; q->ucounts = ucounts; } /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, int override_rlimit) { struct ucounts *ucounts = sig_get_ucounts(t, sig, override_rlimit); struct sigqueue *q; if (!ucounts) return NULL; q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); if (!q) { dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); return NULL; } __sigqueue_init(q, ucounts, 0); return q; } static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) { posixtimer_sigqueue_putref(q); return; } if (q->ucounts) { dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; sigemptyset(&queue->signal); while (!list_empty(&queue->list)) { q = list_entry(queue->list.next, struct sigqueue , list); list_del_init(&q->list); __sigqueue_free(q); } } /* * Flush all pending signals for this kthread. */ void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); clear_tsk_thread_flag(t, TIF_SIGPENDING); flush_sigqueue(&t->pending); flush_sigqueue(&t->signal->shared_pending); spin_unlock_irqrestore(&t->sighand->siglock, flags); } EXPORT_SYMBOL(flush_signals); void ignore_signals(struct task_struct *t) { int i; for (i = 0; i < _NSIG; ++i) t->sighand->action[i].sa.sa_handler = SIG_IGN; flush_signals(t); } /* * Flush all handlers for a task. */ void flush_signal_handlers(struct task_struct *t, int force_default) { int i; struct k_sigaction *ka = &t->sighand->action[0]; for (i = _NSIG ; i != 0 ; i--) { if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; #ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); ka++; } } bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return true; if (handler != SIG_IGN && handler != SIG_DFL) return false; /* If dying, we handle all new signals by ignoring them */ if (fatal_signal_pending(tsk)) return false; /* if ptraced, let the tracer determine */ return !tsk->ptrace; } static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { struct sigqueue *q, *first = NULL; /* * Collect the siginfo appropriate to this signal. Check if * there is another siginfo for the same signal. */ list_for_each_entry(q, &list->list, list) { if (q->info.si_signo == sig) { if (first) goto still_pending; first = q; } } sigdelset(&list->signal, sig); if (first) { still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); /* * posix-timer signals are preallocated and freed when the last * reference count is dropped in posixtimer_deliver_signal() or * immediately on timer deletion when the signal is not pending. * Spare the extra round through __sigqueue_free() which is * ignoring preallocated signals. */ if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER))) *timer_sigq = first; else __sigqueue_free(first); } else { /* * Ok, it wasn't in the queue. This must be * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; info->si_pid = 0; info->si_uid = 0; } } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, kernel_siginfo_t *info, struct sigqueue **timer_sigq) { int sig = next_signal(pending, mask); if (sig) collect_signal(sig, pending, info, timer_sigq); return sig; } /* * Try to dequeue a signal. If a deliverable signal is found fill in the * caller provided siginfo and return the signal number. Otherwise return * 0. */ int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type) { struct task_struct *tsk = current; struct sigqueue *timer_sigq; int signr; lockdep_assert_held(&tsk->sighand->siglock); again: *type = PIDTYPE_PID; timer_sigq = NULL; signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq); if (!signr) { *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &timer_sigq); if (unlikely(signr == SIGALRM)) posixtimer_rearm_itimer(tsk); } recalc_sigpending(); if (!signr) return 0; if (unlikely(sig_kernel_stop(signr))) { /* * Set a marker that we have dequeued a stop signal. Our * caller might release the siglock and then the pending * stop signal it is about to process is no longer in the * pending bitmasks, but must still be cleared by a SIGCONT * (and overruled by a SIGKILL). So those cases clear this * shared flag after we've set it. Note that this flag may * remain set after the signal we return is ignored or * handled. That doesn't matter because its only purpose * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) { if (!posixtimer_deliver_signal(info, timer_sigq)) goto again; } return signr; } EXPORT_SYMBOL_GPL(dequeue_signal); static int dequeue_synchronous_signal(kernel_siginfo_t *info) { struct task_struct *tsk = current; struct sigpending *pending = &tsk->pending; struct sigqueue *q, *sync = NULL; /* * Might a synchronous signal be in the queue? */ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) return 0; /* * Return the first synchronous signal in the queue. */ list_for_each_entry(q, &pending->list, list) { /* Synchronous signals have a positive si_code */ if ((q->info.si_code > SI_USER) && (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { sync = q; goto next; } } return 0; next: /* * Check if there is another siginfo for the same signal. */ list_for_each_entry_continue(q, &pending->list, list) { if (q->info.si_signo == sync->info.si_signo) goto still_pending; } sigdelset(&pending->signal, sync->info.si_signo); recalc_sigpending(); still_pending: list_del_init(&sync->list); copy_siginfo(info, &sync->info); __sigqueue_free(sync); return info->si_signo; } /* * Tell a process that it has a new active signal.. * * NOTE! we rely on the previous spin_lock to * lock interrupts for us! We can only be called with * "siglock" held, and the local interrupt must * have been disabled when that got acquired! * * No need to set need_resched since signal event passing * goes through ->blocked */ void signal_wake_up_state(struct task_struct *t, unsigned int state) { lockdep_assert_held(&t->sighand->siglock); set_tsk_thread_flag(t, TIF_SIGPENDING); /* * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q); static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q) { if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER)) __sigqueue_free(q); else posixtimer_sig_ignore(tsk, q); } /* Remove signals in mask from the pending set and queue. */ static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; lockdep_assert_held(&p->sighand->siglock); sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); sigqueue_free_ignored(p, q); } } } static inline int is_si_special(const struct kernel_siginfo *info) { return info <= SEND_SIG_PRIV; } static inline bool si_fromuser(const struct kernel_siginfo *info) { return info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)); } /* * called with RCU read lock from check_kill_permission() */ static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); return uid_eq(cred->euid, tcred->suid) || uid_eq(cred->euid, tcred->uid) || uid_eq(cred->uid, tcred->suid) || uid_eq(cred->uid, tcred->uid) || ns_capable(tcred->user_ns, CAP_KILL); } /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct kernel_siginfo *info, struct task_struct *t) { struct pid *sid; int error; if (!valid_signal(sig)) return -EINVAL; if (!si_fromuser(info)) return 0; error = audit_signal_info(sig, t); /* Let audit system see the signal */ if (error) return error; if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); /* * We don't return the error if sid == NULL. The * task was unhashed, the caller must notice this. */ if (!sid || sid == task_session(current)) break; fallthrough; default: return -EPERM; } } return security_task_kill(t, info, sig, NULL); } /** * ptrace_trap_notify - schedule trap to notify ptracer * @t: tracee wanting to notify tracer * * This function schedules sticky ptrace trap which is cleared on the next * TRAP_STOP to notify ptracer of an event. @t must have been seized by * ptracer. * * If @t is running, STOP trap will be taken. If trapped for STOP and * ptracer is listening for events, tracee is woken up so that it can * re-trap for the new event. If trapped otherwise, STOP trap will be * eventually taken without returning to userland after the existing traps * are finished by PTRACE_CONT. * * CONTEXT: * Must be called with @task->sighand->siglock held. */ static void ptrace_trap_notify(struct task_struct *t) { WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); lockdep_assert_held(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation * time regardless of blocking, ignoring, or handling. This does the * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. * * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ static bool prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; sigset_t flush; if (signal->flags & SIGNAL_GROUP_EXIT) { if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, drop the signal. */ return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ siginitset(&flush, sigmask(SIGCONT)); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &flush, &t->pending); } else if (sig == SIGCONT) { unsigned int why; /* * Remove all stop signals from all queues, wake all threads. */ siginitset(&flush, SIG_KERNEL_STOP_MASK); flush_sigqueue_mask(p, &flush, &signal->shared_pending); for_each_thread(p, t) { flush_sigqueue_mask(p, &flush, &t->pending); task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); if (likely(!(t->ptrace & PT_SEIZED))) { t->jobctl &= ~JOBCTL_STOPPED; wake_up_state(t, __TASK_STOPPED); } else ptrace_trap_notify(t); } /* * Notify the parent with CLD_CONTINUED if we were stopped. * * If we were in the middle of a group stop, we pretend it * was already finished, and then continued. Since SIGCHLD * doesn't queue we report only CLD_STOPPED, as if the next * CLD_CONTINUED was dropped. */ why = 0; if (signal->flags & SIGNAL_STOP_STOPPED) why |= SIGNAL_CLD_CONTINUED; else if (signal->group_stop_count) why |= SIGNAL_CLD_STOPPED; if (why) { /* * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal(). */ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } } return !sig_ignored(p, sig, force); } /* * Test if P wants to take SIG. After we've checked all threads with this, * it's equivalent to finding no threads not blocking SIG. Any threads not * blocking SIG were ruled out because they are not running and already * have pending signals. Such threads will dequeue from the shared queue * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) return false; if (p->flags & PF_EXITING) return false; if (sig == SIGKILL) return true; if (task_is_stopped_or_traced(p)) return false; return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) { struct signal_struct *signal = p->signal; struct task_struct *t; /* * Now find a thread we can wake up to take the signal off the queue. * * Try the suggested task first (may or may not be the main thread). */ if (wants_signal(sig, p)) t = p; else if ((type == PIDTYPE_PID) || thread_group_empty(p)) /* * There is just one thread and it does not need to be woken. * It will dequeue unblocked signals before it runs again. */ return; else { /* * Otherwise try to find a suitable thread. */ t = signal->curr_target; while (!wants_signal(sig, t)) { t = next_thread(t); if (t == signal->curr_target) /* * No thread needs to be woken. * Any eligible threads will see * the signal in the queue soon. */ return; } signal->curr_target = t; } /* * Found a killable thread. If the signal will be fatal, * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ if (!sig_kernel_coredump(sig)) { /* * Start a group exit and wake everybody up. * This way we don't have other threads * running and doing things after a slower * thread has the fatal signal pending. */ signal->flags = SIGNAL_GROUP_EXIT; signal->group_exit_code = sig; signal->group_stop_count = 0; __for_each_thread(signal, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return; } } /* * The signal is already in the shared-pending queue. * Tell the chosen thread to wake up and dequeue it. */ signal_wake_up(t, sig == SIGKILL); return; } static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } static int __send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; int override_rlimit; int ret = 0, result; lockdep_assert_held(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; /* * Short-circuit ignored signals and support queuing * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) goto ret; result = TRACE_SIGNAL_DELIVERED; /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* * Real-time signals must be queued if sent by sigqueue, or * some other real-time mechanism. It is implementation * defined whether kill() does so. We attempt to do so, on * the principle of least surprise, but since kill is not * allowed to fail with EAGAIN when low on memory we just * make sure at least one signal gets delivered and don't * pass on the info struct. */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else override_rlimit = 0; q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); rcu_read_lock(); q->info.si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), current_uid()); rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; q->info.si_pid = 0; q->info.si_uid = 0; break; default: copy_siginfo(&q->info, info); break; } } else if (!is_si_special(info) && sig >= SIGRTMIN && info->si_code != SI_USER) { /* * Queue overflow, abort. We may abort if the * signal was rt and sent by user using something * other than kill(). */ result = TRACE_SIGNAL_OVERFLOW_FAIL; ret = -EAGAIN; goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ result = TRACE_SIGNAL_LOSE_INFO; } out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); /* Let multiprocess signals appear after on-going forks */ if (type > PIDTYPE_TGID) { struct multiprocess_signals *delayed; hlist_for_each_entry(delayed, &t->signal->multiprocess, node) { sigset_t *signal = &delayed->signal; /* Can't queue both a stop and a continue signal */ if (sig == SIGCONT) sigdelsetmask(signal, SIG_KERNEL_STOP_MASK); else if (sig_kernel_stop(sig)) sigdelset(signal, SIGCONT); sigaddset(signal, sig); } } complete_signal(sig, t, type); ret: trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result); return ret; } static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) { bool ret = false; switch (siginfo_layout(info->si_signo, info->si_code)) { case SIL_KILL: case SIL_CHLD: case SIL_RT: ret = true; break; case SIL_TIMER: case SIL_POLL: case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: case SIL_SYS: ret = false; break; } return ret; } int send_signal_locked(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ bool force = false; if (info == SEND_SIG_NOINFO) { /* Force if sent from an ancestor pid namespace */ force = !task_pid_nr_ns(current, task_active_pid_ns(t)); } else if (info == SEND_SIG_PRIV) { /* Don't ignore kernel generated signals */ force = true; } else if (has_si_pid_and_uid(info)) { /* SIGKILL and SIGSTOP is special or has ids */ struct user_namespace *t_user_ns; rcu_read_lock(); t_user_ns = task_cred_xxx(t, user_ns); if (current_user_ns() != t_user_ns) { kuid_t uid = make_kuid(current_user_ns(), info->si_uid); info->si_uid = from_kuid_munged(t_user_ns, uid); } rcu_read_unlock(); /* A kernel generated signal? */ force = (info->si_code == SI_KERNEL); /* From an ancestor pid namespace? */ if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { info->si_pid = 0; force = true; } } return __send_signal_locked(sig, info, t, type, force); } static void print_fatal_signal(int signr) { struct pt_regs *regs = task_pt_regs(current); struct file *exe_file; exe_file = get_task_exe_file(current); if (exe_file) { pr_info("%pD: %s: potentially unexpected fatal signal %d.\n", exe_file, current->comm, signr); fput(exe_file); } else { pr_info("%s: potentially unexpected fatal signal %d.\n", current->comm, signr); } #if defined(__i386__) && !defined(__arch_um__) pr_info("code at %08lx: ", regs->ip); { int i; for (i = 0; i < 16; i++) { unsigned char insn; if (get_user(insn, (unsigned char *)(regs->ip + i))) break; pr_cont("%02x ", insn); } } pr_cont("\n"); #endif preempt_disable(); show_regs(regs); preempt_enable(); } static int __init setup_print_fatal_signals(char *str) { get_option (&str, &print_fatal_signals); return 1; } __setup("print-fatal-signals=", setup_print_fatal_signals); int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { unsigned long flags; int ret = -ESRCH; if (lock_task_sighand(p, &flags)) { ret = send_signal_locked(sig, info, p, type); unlock_task_sighand(p, &flags); } return ret; } enum sig_handler { HANDLER_CURRENT, /* If reachable use the current handler */ HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ HANDLER_EXIT, /* Only visible as the process exit code */ }; /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. * * Note: If we unblock the signal, we always reset it to SIG_DFL, * since we do not want to have a signal handler that was blocked * be invoked when user space had explicitly blocked it. * * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ static int force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; if (handler == HANDLER_EXIT) action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) sigdelset(&t->blocked, sig); } /* * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect * debugging to leave init killable. But HANDLER_EXIT is always fatal. */ if (action->sa.sa_handler == SIG_DFL && (!t->ptrace || (handler == HANDLER_EXIT))) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = send_signal_locked(sig, info, t, PIDTYPE_PID); /* This can happen if the signal was already pending and blocked */ if (!task_sigpending(t)) signal_wake_up(t, 0); spin_unlock_irqrestore(&t->sighand->siglock, flags); return ret; } int force_sig_info(struct kernel_siginfo *info) { return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* * Nuke all other threads in the group. */ int zap_other_threads(struct task_struct *p) { struct task_struct *t; int count = 0; p->signal->group_stop_count = 0; for_other_threads(p, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); count++; /* Don't bother with already dead threads */ if (t->exit_state) continue; sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } return count; } struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; rcu_read_lock(); for (;;) { sighand = rcu_dereference(tsk->sighand); if (unlikely(sighand == NULL)) break; /* * This sighand can be already freed and even reused, but * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which * initializes ->siglock: this slab can't go away, it has * the same object type, ->siglock can't be reinitialized. * * We need to ensure that tsk->sighand is still the same * after we take the lock, we can race with de_thread() or * __exit_signal(). In the latter case the next iteration * must see ->sighand == NULL. */ spin_lock_irqsave(&sighand->siglock, *flags); if (likely(sighand == rcu_access_pointer(tsk->sighand))) break; spin_unlock_irqrestore(&sighand->siglock, *flags); } rcu_read_unlock(); return sighand; } #ifdef CONFIG_LOCKDEP void lockdep_assert_task_sighand_held(struct task_struct *task) { struct sighand_struct *sighand; rcu_read_lock(); sighand = rcu_dereference(task->sighand); if (sighand) lockdep_assert_held(&sighand->siglock); else WARN_ON_ONCE(1); rcu_read_unlock(); } #endif /* * send signal info to all the members of a thread group or to the * individual thread if type == PIDTYPE_PID. */ int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type) { int ret; rcu_read_lock(); ret = check_kill_permission(sig, info, p); rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, type); return ret; } /* * __kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) * - the caller must hold at least a readlock on tasklist_lock */ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int ret = -ESRCH; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID); /* * If group_send_sig_info() succeeds at least once ret * becomes 0 and after that the code below has no effect. * Otherwise we return the last err or -ESRCH if this * process group is empty. */ if (ret) ret = err; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; } static int kill_pid_info_type(int sig, struct kernel_siginfo *info, struct pid *pid, enum pid_type type) { int error = -ESRCH; struct task_struct *p; for (;;) { rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (p) error = group_send_sig_info(sig, info, p, type); rcu_read_unlock(); if (likely(!p || error != -ESRCH)) return error; /* * The task was unhashed in between, try again. If it * is dead, pid_task() will return NULL, if we race with * de_thread() it will find the new leader. */ } } int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid) { return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID); } static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid) { int error; rcu_read_lock(); error = kill_pid_info(sig, info, find_vpid(pid)); rcu_read_unlock(); return error; } static inline bool kill_as_cred_perm(const struct cred *cred, struct task_struct *target) { const struct cred *pcred = __task_cred(target); return uid_eq(cred->euid, pcred->suid) || uid_eq(cred->euid, pcred->uid) || uid_eq(cred->uid, pcred->suid) || uid_eq(cred->uid, pcred->uid); } /* * The usb asyncio usage of siginfo is wrong. The glibc support * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. * AKA after the generic fields: * kernel_pid_t si_pid; * kernel_uid32_t si_uid; * sigval_t si_value; * * Unfortunately when usb generates SI_ASYNCIO it assumes the layout * after the generic fields is: * void __user *si_addr; * * This is a practical problem when there is a 64bit big endian kernel * and a 32bit userspace. As the 32bit address will encoded in the low * 32bits of the pointer. Those low 32bits will be stored at higher * address than appear in a 32 bit pointer. So userspace will not * see the address it was expecting for it's completions. * * There is nothing in the encoding that can allow * copy_siginfo_to_user32 to detect this confusion of formats, so * handle this by requiring the caller of kill_pid_usb_asyncio to * notice when this situration takes place and to store the 32bit * pointer in sival_int, instead of sival_addr of the sigval_t addr * parameter. */ int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *pid, const struct cred *cred) { struct kernel_siginfo info; struct task_struct *p; unsigned long flags; int ret = -EINVAL; if (!valid_signal(sig)) return ret; clear_siginfo(&info); info.si_signo = sig; info.si_errno = errno; info.si_code = SI_ASYNCIO; *((sigval_t *)&info.si_pid) = addr; rcu_read_lock(); p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; } if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; } out_unlock: rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). * * POSIX specifies that kill(-1,sig) is unspecified, but what we have * is probably wrong. Should make it like BSD or SYSV. */ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid) { int ret; if (pid > 0) return kill_proc_info(sig, info, pid); /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */ if (pid == INT_MIN) return -ESRCH; read_lock(&tasklist_lock); if (pid != -1) { ret = __kill_pgrp_info(sig, info, pid ? find_vpid(-pid) : task_pgrp(current)); } else { int retval = 0, count = 0; struct task_struct * p; for_each_process(p) { if (task_pid_vnr(p) > 1 && !same_thread_group(p, current)) { int err = group_send_sig_info(sig, info, p, PIDTYPE_MAX); ++count; if (err != -EPERM) retval = err; } } ret = count ? retval : -ESRCH; } read_unlock(&tasklist_lock); return ret; } /* * These are for backward compatibility with the rest of the kernel source. */ int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). */ if (!valid_signal(sig)) return -EINVAL; return do_send_sig_info(sig, info, p, PIDTYPE_PID); } EXPORT_SYMBOL(send_sig_info); #define __si_special(priv) \ ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) int send_sig(int sig, struct task_struct *p, int priv) { return send_sig_info(sig, __si_special(priv), p); } EXPORT_SYMBOL(send_sig); void force_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } EXPORT_SYMBOL(force_sig); void force_fatal_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } void force_exit_sig(int sig) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; force_sig_info_to_task(&info, current, HANDLER_EXIT); } /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ void force_sigsegv(int sig) { if (sig == SIGSEGV) force_fatal_sig(SIGSEGV); else force_sig(SIGSEGV); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr) { return force_sig_fault_to_task(sig, code, addr, current); } int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; return send_sig_info(info.si_signo, &info, t); } int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) { struct kernel_siginfo info; WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); clear_siginfo(&info); info.si_signo = SIGBUS; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; return send_sig_info(info.si_signo, &info, t); } EXPORT_SYMBOL(send_sig_mceerr); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_BNDERR; info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; return force_sig_info(&info); } #ifdef SEGV_PKUERR int force_sig_pkuerr(void __user *addr, u32 pkey) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSEGV; info.si_errno = 0; info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; return force_sig_info(&info); } #endif int send_sig_perf(void __user *addr, u32 type, u64 sig_data) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_PERF; info.si_addr = addr; info.si_perf_data = sig_data; info.si_perf_type = type; /* * Signals generated by perf events should not terminate the whole * process if SIGTRAP is blocked, however, delivering the signal * asynchronously is better than not delivering at all. But tell user * space if the signal was asynchronous, so it can clearly be * distinguished from normal synchronous ones. */ info.si_perf_flags = sigismember(&current->blocked, info.si_signo) ? TRAP_PERF_FLAG_ASYNC : 0; return send_sig_info(info.si_signo, &info, current); } /** * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ int force_sig_seccomp(int syscall, int reason, bool force_coredump) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGSYS; info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; return force_sig_info_to_task(&info, current, force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. */ int force_sig_ptrace_errno_trap(int errno, void __user *addr) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return force_sig_info(&info); } /* For the rare architectures that include trap information using * si_trapno. */ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t) { struct kernel_siginfo info; clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = code; info.si_addr = addr; info.si_trapno = trapno; return send_sig_info(info.si_signo, &info, t); } static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; read_lock(&tasklist_lock); ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); return ret; } int kill_pgrp(struct pid *pid, int sig, int priv) { return kill_pgrp_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) { return kill_pid_info(sig, __si_special(priv), pid); } EXPORT_SYMBOL(kill_pid); #ifdef CONFIG_POSIX_TIMERS /* * These functions handle POSIX timer signals. POSIX timers use * preallocated sigqueue structs for sending signals. */ static void __flush_itimer_signals(struct sigpending *pending) { sigset_t signal, retain; struct sigqueue *q, *n; signal = pending->signal; sigemptyset(&retain); list_for_each_entry_safe(q, n, &pending->list, list) { int sig = q->info.si_signo; if (likely(q->info.si_code != SI_TIMER)) { sigaddset(&retain, sig); } else { sigdelset(&signal, sig); list_del_init(&q->list); __sigqueue_free(q); } } sigorsets(&pending->signal, &signal, &retain); } void flush_itimer_signals(void) { struct task_struct *tsk = current; guard(spinlock_irqsave)(&tsk->sighand->siglock); __flush_itimer_signals(&tsk->pending); __flush_itimer_signals(&tsk->signal->shared_pending); } bool posixtimer_init_sigqueue(struct sigqueue *q) { struct ucounts *ucounts = sig_get_ucounts(current, -1, 0); if (!ucounts) return false; clear_siginfo(&q->info); __sigqueue_init(q, ucounts, SIGQUEUE_PREALLOC); return true; } static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type) { struct sigpending *pending; int sig = q->info.si_signo; signalfd_notify(t, sig); pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, type); } /* * This function is used by POSIX timers to deliver a timer signal. * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID * set), the signal must be delivered to the specific thread (queues * into t->pending). * * Where type is not PIDTYPE_PID, signals must be delivered to the * process. In this case, prefer to deliver to current if it is in * the same thread group as the target process and its sighand is * stable, which avoids unnecessarily waking up a potentially idle task. */ static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr) { struct task_struct *t = pid_task(tmr->it_pid, tmr->it_pid_type); if (t && tmr->it_pid_type != PIDTYPE_PID && same_thread_group(t, current) && !current->exit_state) t = current; return t; } void posixtimer_send_sigqueue(struct k_itimer *tmr) { struct sigqueue *q = &tmr->sigq; int sig = q->info.si_signo; struct task_struct *t; unsigned long flags; int result; guard(rcu)(); t = posixtimer_get_target(tmr); if (!t) return; if (!likely(lock_task_sighand(t, &flags))) return; /* * Update @tmr::sigqueue_seq for posix timer signals with sighand * locked to prevent a race against dequeue_signal(). */ tmr->it_sigqueue_seq = tmr->it_signal_seq; /* * Set the signal delivery status under sighand lock, so that the * ignored signal handling can distinguish between a periodic and a * non-periodic timer. */ tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING; if (!prepare_signal(sig, t, false)) { result = TRACE_SIGNAL_IGNORED; if (!list_empty(&q->list)) { /* * The signal was ignored and blocked. The timer * expiry queued it because blocked signals are * queued independent of the ignored state. * * The unblocking set SIGPENDING, but the signal * was not yet dequeued from the pending list. * So prepare_signal() sees unblocked and ignored, * which ends up here. Leave it queued like a * regular signal. * * The same happens when the task group is exiting * and the signal is already queued. * prepare_signal() treats SIGNAL_GROUP_EXIT as * ignored independent of its queued state. This * gets cleaned up in __exit_signal(). */ goto out; } /* Periodic timers with SIG_IGN are queued on the ignored list */ if (tmr->it_sig_periodic) { /* * Already queued means the timer was rearmed after * the previous expiry got it on the ignore list. * Nothing to do for that case. */ if (hlist_unhashed(&tmr->ignored_list)) { /* * Take a signal reference and queue it on * the ignored list. */ posixtimer_sigqueue_getref(q); posixtimer_sig_ignore(t, q); } } else if (!hlist_unhashed(&tmr->ignored_list)) { /* * Covers the case where a timer was periodic and * then the signal was ignored. Later it was rearmed * as oneshot timer. The previous signal is invalid * now, and this oneshot signal has to be dropped. * Remove it from the ignored list and drop the * reference count as the signal is not longer * queued. */ hlist_del_init(&tmr->ignored_list); posixtimer_putref(tmr); } goto out; } if (unlikely(!list_empty(&q->list))) { /* This holds a reference count already */ result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } /* * If the signal is on the ignore list, it got blocked after it was * ignored earlier. But nothing lifted the ignore. Move it back to * the pending list to be consistent with the regular signal * handling. This already holds a reference count. * * If it's not on the ignore list acquire a reference count. */ if (likely(hlist_unhashed(&tmr->ignored_list))) posixtimer_sigqueue_getref(q); else hlist_del_init(&tmr->ignored_list); posixtimer_queue_sigqueue(q, t, tmr->it_pid_type); result = TRACE_SIGNAL_DELIVERED; out: trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result); unlock_task_sighand(t, &flags); } static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { struct k_itimer *tmr = container_of(q, struct k_itimer, sigq); /* * If the timer is marked deleted already or the signal originates * from a non-periodic timer, then just drop the reference * count. Otherwise queue it on the ignored list. */ if (posixtimer_valid(tmr) && tmr->it_sig_periodic) hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers); else posixtimer_putref(tmr); } static void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { struct hlist_head *head = &tsk->signal->ignored_posix_timers; struct hlist_node *tmp; struct k_itimer *tmr; if (likely(hlist_empty(head))) return; /* * Rearming a timer with sighand lock held is not possible due to * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and * let the signal delivery path deal with it whether it needs to be * rearmed or not. This cannot be decided here w/o dropping sighand * lock and creating a loop retry horror show. */ hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) { struct task_struct *target; /* * tmr::sigq.info.si_signo is immutable, so accessing it * without holding tmr::it_lock is safe. */ if (tmr->sigq.info.si_signo != sig) continue; hlist_del_init(&tmr->ignored_list); /* This should never happen and leaks a reference count */ if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list))) continue; /* * Get the target for the signal. If target is a thread and * has exited by now, drop the reference count. */ guard(rcu)(); target = posixtimer_get_target(tmr); if (target) posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type); else posixtimer_putref(tmr); } } #else /* CONFIG_POSIX_TIMERS */ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { } static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { } #endif /* !CONFIG_POSIX_TIMERS */ void do_notify_pidfd(struct task_struct *task) { struct pid *pid = task_pid(task); WARN_ON(task->exit_state == 0); __wake_up(&pid->wait_pidfd, TASK_NORMAL, 0, poll_to_key(EPOLLIN | EPOLLRDNORM)); } /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * * Returns true if our parent ignored us and so we've switched to * self-reaping. */ bool do_notify_parent(struct task_struct *tsk, int sig) { struct kernel_siginfo info; unsigned long flags; struct sighand_struct *psig; bool autoreap = false; u64 utime, stime; WARN_ON_ONCE(sig == -1); /* do_notify_parent_cldstop should have been called instead. */ WARN_ON_ONCE(task_is_stopped_or_traced(tsk)); WARN_ON_ONCE(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); /* ptraced, or group-leader without sub-threads */ do_notify_pidfd(tsk); if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. * Check if it has changed security domain. */ if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id)) sig = SIGCHLD; } clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* * We are under tasklist_lock here so our parent is tied to * us and cannot change. * * task_active_pid_ns will always return the same pid namespace * until a task passes through release_task. * * write_lock() currently calls preempt_disable() which is the * same as rcu_read_lock(), but according to Oleg, this is not * correct to rely on this */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent)); info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) info.si_code = CLD_DUMPED; else if (tsk->exit_code & 0x7f) info.si_code = CLD_KILLED; else { info.si_code = CLD_EXITED; info.si_status = tsk->exit_code >> 8; } psig = tsk->parent->sighand; spin_lock_irqsave(&psig->siglock, flags); if (!tsk->ptrace && sig == SIGCHLD && (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { /* * We are exiting and our parent doesn't care. POSIX.1 * defines special semantics for setting SIGCHLD to SIG_IGN * or setting the SA_NOCLDWAIT flag: we should be reaped * automatically and not left for our parent's wait4 call. * Rather than having the parent do it as a magic kind of * signal handler, we just set this to tell do_exit that we * can be cleaned up without becoming a zombie. Note that * we still call __wake_up_parent in this case, because a * blocked sys_wait4 might now return -ECHILD. * * Whether we send SIGCHLD or not for SA_NOCLDWAIT * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ autoreap = true; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. */ if (valid_signal(sig) && sig) __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); return autoreap; } /** * do_notify_parent_cldstop - notify parent of stopped/continued state change * @tsk: task reporting the state change * @for_ptracer: the notification is for ptracer * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report * * Notify @tsk's parent that the stopped/continued state has changed. If * @for_ptracer is %false, @tsk's group leader notifies to its real parent. * If %true, @tsk reports to @tsk->parent which should be the ptracer. * * CONTEXT: * Must be called with tasklist_lock at least read locked. */ static void do_notify_parent_cldstop(struct task_struct *tsk, bool for_ptracer, int why) { struct kernel_siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; u64 utime, stime; if (for_ptracer) { parent = tsk->parent; } else { tsk = tsk->group_leader; parent = tsk->real_parent; } clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent)); info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk)); rcu_read_unlock(); task_cputime(tsk, &utime, &stime); info.si_utime = nsec_to_clock_t(utime); info.si_stime = nsec_to_clock_t(stime); info.si_code = why; switch (why) { case CLD_CONTINUED: info.si_status = SIGCONT; break; case CLD_STOPPED: info.si_status = tsk->signal->group_exit_code & 0x7f; break; case CLD_TRAPPED: info.si_status = tsk->exit_code & 0x7f; break; default: BUG(); } sighand = parent->sighand; spin_lock_irqsave(&sighand->siglock, flags); if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN && !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID); /* * Even if SIGCHLD is not generated, we must wake up wait4 calls. */ __wake_up_parent(tsk, parent); spin_unlock_irqrestore(&sighand->siglock, flags); } /* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. * We always set current->last_siginfo while stopped here. * That makes it a way to test a stopped process for * being ptrace-stopped vs being job-control-stopped. * * Returns the signal the ptracer requested the code resume * with. If the code did not stop because the tracer is gone, * the stop signal remains unchanged unless clear_code. */ static int ptrace_stop(int exit_code, int why, unsigned long message, kernel_siginfo_t *info) __releases(&current->sighand->siglock) __acquires(&current->sighand->siglock) { bool gstop_done = false; if (arch_ptrace_stop_needed()) { /* * The arch code has something special to do before a * ptrace stop. This is allowed to block, e.g. for faults * on user stack pages. We can't keep the siglock while * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. */ spin_unlock_irq(&current->sighand->siglock); arch_ptrace_stop(); spin_lock_irq(&current->sighand->siglock); } /* * After this point ptrace_signal_wake_up or signal_wake_up * will clear TASK_TRACED if ptrace_unlink happens or a fatal * signal comes in. Handle previous ptrace_unlinks and fatal * signals here to prevent ptrace_stop sleeping in schedule. */ if (!current->ptrace || __fatal_signal_pending(current)) return exit_code; set_special_state(TASK_TRACED); current->jobctl |= JOBCTL_TRACED; /* * We're committing to trapping. TRACED should be visible before * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). * Also, transition to TRACED and updates to ->jobctl should be * atomic with respect to siglock and should be done after the arch * hook as siglock is released and regrabbed across it. * * TRACER TRACEE * * ptrace_attach() * [L] wait_on_bit(JOBCTL_TRAPPING) [S] set_special_state(TRACED) * do_wait() * set_current_state() smp_wmb(); * ptrace_do_wait() * wait_task_stopped() * task_stopped_code() * [L] task_is_traced() [S] task_clear_jobctl_trapping(); */ smp_wmb(); current->ptrace_message = message; current->last_siginfo = info; current->exit_code = exit_code; /* * If @why is CLD_STOPPED, we're trapping to participate in a group * stop. Do the bookkeeping. Note that if SIGCONT was delievered * across siglock relocks since INTERRUPT was scheduled, PENDING * could be clear now. We act as if SIGCONT is received after * TASK_TRACED is entered - ignore it. */ if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) gstop_done = task_participate_group_stop(current); /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); /* entering a trap, clear TRAPPING */ task_clear_jobctl_trapping(current); spin_unlock_irq(&current->sighand->siglock); read_lock(&tasklist_lock); /* * Notify parents of the stop. * * While ptraced, there are two parents - the ptracer and * the real_parent of the group_leader. The ptracer should * know about every stop while the real parent is only * interested in the completion of group stop. The states * for the two don't interact with each other. Notify * separately unless they're gonna be duplicates. */ if (current->ptrace) do_notify_parent_cldstop(current, true, why); if (gstop_done && (!current->ptrace || ptrace_reparented(current))) do_notify_parent_cldstop(current, false, why); /* * The previous do_notify_parent_cldstop() invocation woke ptracer. * One a PREEMPTION kernel this can result in preemption requirement * which will be fulfilled after read_unlock() and the ptracer will be * put on the CPU. * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for * this task wait in schedule(). If this task gets preempted then it * remains enqueued on the runqueue. The ptracer will observe this and * then sleep for a delay of one HZ tick. In the meantime this task * gets scheduled, enters schedule() and will wait for the ptracer. * * This preemption point is not bad from a correctness point of * view but extends the runtime by one HZ tick time due to the * ptracer's sleep. The preempt-disable section ensures that there * will be no preemption between unlock and schedule() and so * improving the performance since the ptracer will observe that * the tracee is scheduled out once it gets on the CPU. * * On PREEMPT_RT locking tasklist_lock does not disable preemption. * Therefore the task can be preempted after do_notify_parent_cldstop() * before unlocking tasklist_lock so there is no benefit in doing this. * * In fact disabling preemption is harmful on PREEMPT_RT because * the spinlock_t in cgroup_enter_frozen() must not be acquired * with preemption disabled due to the 'sleeping' spinlock * substitution of RT. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable_no_resched(); schedule(); cgroup_leave_frozen(true); /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. */ spin_lock_irq(&current->sighand->siglock); exit_code = current->exit_code; current->last_siginfo = NULL; current->ptrace_message = 0; current->exit_code = 0; /* LISTENING can be set only during STOP traps, clear it */ current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN); /* * Queued signals ignored us while we were stopped for tracing. * So check for any that we should take before resuming user mode. * This sets TIF_SIGPENDING, but never clears it. */ recalc_sigpending_tsk(current); return exit_code; } static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); /* Let the debugger run. */ return ptrace_stop(exit_code, why, message, &info); } int ptrace_notify(int exit_code, unsigned long message) { int signr; BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); if (unlikely(task_work_pending(current))) task_work_run(); spin_lock_irq(&current->sighand->siglock); signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message); spin_unlock_irq(&current->sighand->siglock); return signr; } /** * do_signal_stop - handle group stop for SIGSTOP and other stop signals * @signr: signr causing group stop if initiating * * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr * and participate in it. If already set, participate in the existing * group stop. If participated in a group stop (and thus slept), %true is * returned with siglock released. * * If ptraced, this function doesn't handle stop itself. Instead, * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock * untouched. The caller must ensure that INTERRUPT trap handling takes * places afterwards. * * CONTEXT: * Must be called with @current->sighand->siglock held, which is released * on %true return. * * RETURNS: * %false if group stop is already cancelled or ptrace trap is scheduled. * %true if participated in group stop. */ static bool do_signal_stop(int signr) __releases(&current->sighand->siglock) { struct signal_struct *sig = current->signal; if (!(current->jobctl & JOBCTL_STOP_PENDING)) { unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; struct task_struct *t; /* signr will be recorded in task->jobctl for retries */ WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || unlikely(sig->flags & SIGNAL_GROUP_EXIT) || unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must * initiate one now. * * While ptraced, a task may be resumed while group stop is * still in effect and then receive a stop signal and * initiate another group stop. This deviates from the * usual behavior as two consecutive stop signals can't * cause two group stops when !ptraced. That is why we * also check !task_is_stopped(t) below. * * The condition can be distinguished by testing whether * SIGNAL_STOP_STOPPED is already set. Don't generate * group_exit_code in such case. * * This is not necessary for SIGNAL_STOP_CONTINUED because * an intervening stop signal is required to cause two * continued events regardless of ptrace. */ if (!(sig->flags & SIGNAL_STOP_STOPPED)) sig->group_exit_code = signr; sig->group_stop_count = 0; if (task_set_jobctl_pending(current, signr | gstop)) sig->group_stop_count++; for_other_threads(current, t) { /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ if (!task_is_stopped(t) && task_set_jobctl_pending(t, signr | gstop)) { sig->group_stop_count++; if (likely(!(t->ptrace & PT_SEIZED))) signal_wake_up(t, 0); else ptrace_trap_notify(t); } } } if (likely(!current->ptrace)) { int notify = 0; /* * If there are no other threads in the group, or if there * is a group stop in progress and we are the last to stop, * report to the parent. */ if (task_participate_group_stop(current)) notify = CLD_STOPPED; current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); spin_unlock_irq(&current->sighand->siglock); /* * Notify the parent of the group stop completion. Because * we're not holding either the siglock or tasklist_lock * here, ptracer may attach inbetween; however, this is for * group stop and should always be delivered to the real * parent of the group leader. The new ptracer will get * its notification when this task transitions into * TASK_TRACED. */ if (notify) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, notify); read_unlock(&tasklist_lock); } /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); schedule(); return true; } else { /* * While ptraced, group stop is handled by STOP trap. * Schedule it and let the caller deal with it. */ task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); return false; } } /** * do_jobctl_trap - take care of ptrace jobctl traps * * When PT_SEIZED, it's used for both group stop and explicit * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with * accompanying siginfo. If stopped, lower eight bits of exit_code contain * the stop signal; otherwise, %SIGTRAP. * * When !PT_SEIZED, it's used only for group stop trap with stop signal * number as exit_code and no siginfo. * * CONTEXT: * Must be called with @current->sighand->siglock held, which may be * released and re-acquired before returning with intervening sleep. */ static void do_jobctl_trap(void) { struct signal_struct *signal = current->signal; int signr = current->jobctl & JOBCTL_STOP_SIGMASK; if (current->ptrace & PT_SEIZED) { if (!signal->group_stop_count && !(signal->flags & SIGNAL_STOP_STOPPED)) signr = SIGTRAP; WARN_ON_ONCE(!signr); ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), CLD_STOPPED, 0); } else { WARN_ON_ONCE(!signr); ptrace_stop(signr, CLD_STOPPED, 0, NULL); } } /** * do_freezer_trap - handle the freezer jobctl trap * * Puts the task into frozen state, if only the task is not about to quit. * In this case it drops JOBCTL_TRAP_FREEZE. * * CONTEXT: * Must be called with @current->sighand->siglock held, * which is always released before returning. */ static void do_freezer_trap(void) __releases(&current->sighand->siglock) { /* * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, * let's make another loop to give it a chance to be handled. * In any case, we'll return back. */ if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != JOBCTL_TRAP_FREEZE) { spin_unlock_irq(&current->sighand->siglock); return; } /* * Now we're sure that there is no pending fatal signal and no * pending traps. Clear TIF_SIGPENDING to not get out of schedule() * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(&current->sighand->siglock); cgroup_enter_frozen(); schedule(); /* * We could've been woken by task_work, run it to clear * TIF_NOTIFY_SIGNAL. The caller will retry if necessary. */ clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker * unconditionally because we do not know whether debugger will * change signr. This flag has no meaning unless we are going * to stop after return from ptrace_stop(). In this case it will * be checked in do_signal_stop(), we should only stop if it was * not cleared by SIGCONT while we were sleeping. See also the * comment in dequeue_signal(). */ current->jobctl |= JOBCTL_STOP_DEQUEUED; signr = ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ if (signr == 0) return signr; /* * Update the siginfo structure if the signal has * changed. If the debugger wanted something * specific in the siginfo structure then it should * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); info->si_uid = from_kuid_munged(current_user_ns(), task_uid(current->parent)); rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */ if (sigismember(&current->blocked, signr) || fatal_signal_pending(current)) { send_signal_locked(signr, info, current, type); signr = 0; } return signr; } static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: case SIL_FAULT_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; case SIL_KILL: case SIL_TIMER: case SIL_POLL: case SIL_CHLD: case SIL_RT: case SIL_SYS: break; } } bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; int signr; clear_notify_signal(); if (unlikely(task_work_pending(current))) task_work_run(); if (!task_sigpending(current)) return false; if (unlikely(uprobe_deny_signal())) return false; /* * Do this once, we can't return to user-mode if freezing() == T. * do_signal_stop() and ptrace_stop() do freezable_schedule() and * thus do not need another check after return. */ try_to_freeze(); relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if * we should notify the parent, prepare_signal(SIGCONT) encodes * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { int why; if (signal->flags & SIGNAL_CLD_CONTINUED) why = CLD_CONTINUED; else why = CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); /* * Notify the parent that we're continuing. This event is * always per-process and doesn't make whole lot of sense * for ptracers, who shouldn't consume the state via * wait(2) either, but, for backward compatibility, notify * the ptracer of the group leader too unless it's gonna be * a duplicate. */ read_lock(&tasklist_lock); do_notify_parent_cldstop(current, false, why); if (ptrace_reparented(current->group_leader)) do_notify_parent_cldstop(current->group_leader, true, why); read_unlock(&tasklist_lock); goto relock; } for (;;) { struct k_sigaction *ka; enum pid_type type; /* Has this task already been marked for death? */ if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->group_exec_task) { signr = SIGKILL; sigdelset(&current->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, &sighand->action[SIGKILL-1]); recalc_sigpending(); /* * implies do_group_exit() or return to PF_USER_WORKER, * no need to initialize ksig->info/etc. */ goto fatal; } if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; if (unlikely(current->jobctl & (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { if (current->jobctl & JOBCTL_TRAP_MASK) { do_jobctl_trap(); spin_unlock_irq(&sighand->siglock); } else if (current->jobctl & JOBCTL_TRAP_FREEZE) do_freezer_trap(); goto relock; } /* * If the task is leaving the frozen state, let's update * cgroup counters and reset the frozen bit. */ if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); cgroup_leave_frozen(false); goto relock; } /* * Signals generated by the execution of an instruction * need to be delivered before any other pending signals * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) signr = dequeue_signal(&current->blocked, &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } ka = &sighand->action[signr-1]; /* Trace actually delivered signals. */ trace_signal_deliver(signr, &ksig->info, ka); if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { /* Run the handler. */ ksig->ka = *ka; if (ka->sa.sa_flags & SA_ONESHOT) ka->sa.sa_handler = SIG_DFL; break; /* will return non-zero "signr" value */ } /* * Now we are doing the default action for this signal. */ if (sig_kernel_ignore(signr)) /* Default is nothing. */ continue; /* * Global init gets no signals it doesn't want. * Container-init gets no signals it doesn't want from same * container. * * Note that if global/container-init sees a sig_kernel_only() * signal here, the signal must have been generated internally * or must have come from an ancestor namespace. In either * case, the signal cannot be dropped. */ if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && !sig_kernel_only(signr)) continue; if (sig_kernel_stop(signr)) { /* * The default action is to stop all threads in * the thread group. The job control signals * do nothing in an orphaned pgrp, but SIGSTOP * always works. Note that siglock needs to be * dropped during the call to is_orphaned_pgrp() * because of lock ordering with tasklist_lock. * This allows an intervening SIGCONT to be posted. * We need to check for that and bail out if necessary. */ if (signr != SIGSTOP) { spin_unlock_irq(&sighand->siglock); /* signals can be posted during this window */ if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&sighand->siglock); } if (likely(do_signal_stop(signr))) { /* It released the siglock. */ goto relock; } /* * We didn't actually stop, due to a race * with SIGCONT or something like that. */ continue; } fatal: spin_unlock_irq(&sighand->siglock); if (unlikely(cgroup_task_frozen(current))) cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. */ current->flags |= PF_SIGNALED; if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(signr); proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with * their demise. If we lost the race with another * thread getting here, it set group_exit_code * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ do_coredump(&ksig->info); } /* * PF_USER_WORKER threads will catch and exit on fatal signals * themselves. They have cleanup that must be performed, so we * cannot call do_exit() on their behalf. Note that ksig won't * be properly initialized, PF_USER_WORKER's shouldn't use it. */ if (current->flags & PF_USER_WORKER) goto out; /* * Death signals, no core dump. */ do_group_exit(signr); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); ksig->sig = signr; if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) hide_si_addr_tag_bits(ksig); out: return signr > 0; } /** * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) { sigset_t blocked; /* A signal was successfully delivered, and the saved sigmask was stored on the signal frame, and will be restored by sigreturn. So we can simply clear the restore sigmask flag. */ clear_restore_sigmask(); sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask); if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) sigaddset(&blocked, ksig->sig); set_current_blocked(&blocked); if (current->sas_ss_flags & SS_AUTODISARM) sas_ss_reset(current); if (stepping) ptrace_notify(SIGTRAP, 0); } void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take * the shared signals in @which since we will not. */ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) { sigset_t retarget; struct task_struct *t; sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); if (sigisemptyset(&retarget)) return; for_other_threads(tsk, t) { if (t->flags & PF_EXITING) continue; if (!has_pending_signals(&retarget, &t->blocked)) continue; /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) break; } } void exit_signals(struct task_struct *tsk) { int group_stop = 0; sigset_t unblocked; /* * @tsk is about to have PF_EXITING set - lock out users which * expect stable threadgroup. */ cgroup_threadgroup_change_begin(tsk); if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; } spin_lock_irq(&tsk->sighand->siglock); /* * From now this task is not visible for group-wide signals, * see wants_signal(), do_signal_stop(). */ sched_mm_cid_exit_signals(tsk); tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; signotset(&unblocked); retarget_shared_pending(tsk, &unblocked); if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && task_participate_group_stop(tsk)) group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); /* * If group stop has completed, deliver the notification. This * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } /* * System call entry points. */ /** * sys_restart_syscall - restart a system call */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = &current->restart_block; return restart->fn(restart); } long do_no_restart_syscall(struct restart_block *param) { return -EINTR; } static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, &current->blocked); retarget_shared_pending(tsk, &newblocked); } tsk->blocked = *newset; recalc_sigpending(); } /** * set_current_blocked - change current->blocked mask * @newset: new mask * * It is wrong to change ->blocked directly, this helper should be used * to ensure the process can't miss a shared signal we are going to block. */ void set_current_blocked(sigset_t *newset) { sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP)); __set_current_blocked(newset); } void __set_current_blocked(const sigset_t *newset) { struct task_struct *tsk = current; /* * In case the signal mask hasn't changed, there is nothing we need * to do. The current->blocked shouldn't be modified by other task. */ if (sigequalsets(&tsk->blocked, newset)) return; spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, newset); spin_unlock_irq(&tsk->sighand->siglock); } /* * This is also useful for kernel threads that want to temporarily * (or permanently) block certain signals. * * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel * interface happily blocks "unblockable" signals like SIGKILL * and friends. */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { struct task_struct *tsk = current; sigset_t newset; /* Lockless, only current can change ->blocked, never from irq */ if (oldset) *oldset = tsk->blocked; switch (how) { case SIG_BLOCK: sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: newset = *set; break; default: return -EINVAL; } __set_current_blocked(&newset); return 0; } EXPORT_SYMBOL(sigprocmask); /* * The api helps set app-provided sigmasks. * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. * * Note that it does set_restore_sigmask() in advance, so it must be always * paired with restore_saved_sigmask_unless() before return from syscall. */ int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #ifdef CONFIG_COMPAT int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { sigset_t kmask; if (!umask) return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (get_compat_sigset(&kmask, umask)) return -EFAULT; set_restore_sigmask(); current->saved_sigmask = current->blocked; set_current_blocked(&kmask); return 0; } #endif /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals * @nset: stores pending signals * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { sigset_t old_set, new_set; int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; old_set = current->blocked; if (nset) { if (copy_from_user(&new_set, nset, sizeof(sigset_t))) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } if (oset) { if (copy_to_user(oset, &old_set, sizeof(sigset_t))) return -EFAULT; } return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, compat_sigset_t __user *, oset, compat_size_t, sigsetsize) { sigset_t old_set = current->blocked; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (nset) { sigset_t new_set; int error; if (get_compat_sigset(&new_set, nset)) return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); error = sigprocmask(how, &new_set, NULL); if (error) return error; } return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0; } #endif static void do_sigpending(sigset_t *set) { spin_lock_irq(&current->sighand->siglock); sigorsets(set, &current->pending.signal, &current->signal->shared_pending.signal); spin_unlock_irq(&current->sighand->siglock); /* Outside the lock because only this thread touches it. */ sigandsets(set, &current->blocked, set); } /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sigsetsize)) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; if (sigsetsize > sizeof(*uset)) return -EINVAL; do_sigpending(&set); return put_compat_sigset(uset, &set, sigsetsize); } #endif static const struct { unsigned char limit, layout; } sig_sicodes[] = { [SIGILL] = { NSIGILL, SIL_FAULT }, [SIGFPE] = { NSIGFPE, SIL_FAULT }, [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, [SIGBUS] = { NSIGBUS, SIL_FAULT }, [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, #if defined(SIGEMT) [SIGEMT] = { NSIGEMT, SIL_FAULT }, #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, [SIGSYS] = { NSIGSYS, SIL_SYS }, }; static bool known_siginfo_layout(unsigned sig, int si_code) { if (si_code == SI_KERNEL) return true; else if ((si_code > SI_USER)) { if (sig_specific_sicodes(sig)) { if (si_code <= sig_sicodes[sig].limit) return true; } else if (si_code <= NSIGPOLL) return true; } else if (si_code >= SI_DETHREAD) return true; else if (si_code == SI_ASYNCNL) return true; return false; } enum siginfo_layout siginfo_layout(unsigned sig, int si_code) { enum siginfo_layout layout = SIL_KILL; if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { if ((sig < ARRAY_SIZE(sig_sicodes)) && (si_code <= sig_sicodes[sig].limit)) { layout = sig_sicodes[sig].layout; /* Handle the exceptions */ if ((sig == SIGBUS) && (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO)) layout = SIL_FAULT_MCEERR; else if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR)) layout = SIL_FAULT_BNDERR; #ifdef SEGV_PKUERR else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_FAULT_PERF_EVENT; else if (IS_ENABLED(CONFIG_SPARC) && (sig == SIGILL) && (si_code == ILL_ILLTRP)) layout = SIL_FAULT_TRAPNO; else if (IS_ENABLED(CONFIG_ALPHA) && ((sig == SIGFPE) || ((sig == SIGTRAP) && (si_code == TRAP_UNK)))) layout = SIL_FAULT_TRAPNO; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; } else { if (si_code == SI_TIMER) layout = SIL_TIMER; else if (si_code == SI_SIGIO) layout = SIL_POLL; else if (si_code < 0) layout = SIL_RT; } return layout; } static inline char __user *si_expansion(const siginfo_t __user *info) { return ((char __user *)info) + sizeof(struct kernel_siginfo); } int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from) { char __user *expansion = si_expansion(to); if (copy_to_user(to, from , sizeof(struct kernel_siginfo))) return -EFAULT; if (clear_user(expansion, SI_EXPANSION_SIZE)) return -EFAULT; return 0; } static int post_copy_siginfo_from_user(kernel_siginfo_t *info, const siginfo_t __user *from) { if (unlikely(!known_siginfo_layout(info->si_signo, info->si_code))) { char __user *expansion = si_expansion(from); char buf[SI_EXPANSION_SIZE]; int i; /* * An unknown si_code might need more than * sizeof(struct kernel_siginfo) bytes. Verify all of the * extra bytes are 0. This guarantees copy_siginfo_to_user * will return this data to userspace exactly. */ if (copy_from_user(&buf, expansion, SI_EXPANSION_SIZE)) return -EFAULT; for (i = 0; i < SI_EXPANSION_SIZE; i++) { if (buf[i] != 0) return -E2BIG; } } return 0; } static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; to->si_signo = signo; return post_copy_siginfo_from_user(to, from); } int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) { if (copy_from_user(to, from, sizeof(struct kernel_siginfo))) return -EFAULT; return post_copy_siginfo_from_user(to, from); } #ifdef CONFIG_COMPAT /** * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo * @to: compat siginfo destination * @from: kernel siginfo source * * Note: This function does not work properly for the SIGCHLD on x32, but * fortunately it doesn't have to. The only valid callers for this function are * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. * The latter does not care because SIGCHLD will never cause a coredump. */ void copy_siginfo_to_external32(struct compat_siginfo *to, const struct kernel_siginfo *from) { memset(to, 0, sizeof(*to)); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; to->si_utime = from->si_utime; to->si_stime = from->si_stime; break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = ptr_to_compat(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } } int __copy_siginfo_to_user32(struct compat_siginfo __user *to, const struct kernel_siginfo *from) { struct compat_siginfo new; copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; return 0; } static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo *from) { clear_siginfo(to); to->si_signo = from->si_signo; to->si_errno = from->si_errno; to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: to->si_pid = from->si_pid; to->si_uid = from->si_uid; break; case SIL_TIMER: to->si_tid = from->si_tid; to->si_overrun = from->si_overrun; to->si_int = from->si_int; break; case SIL_POLL: to->si_band = from->si_band; to->si_fd = from->si_fd; break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); break; case SIL_FAULT_TRAPNO: to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; break; case SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); to->si_pkey = from->si_pkey; break; case SIL_FAULT_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); to->si_perf_data = from->si_perf_data; to->si_perf_type = from->si_perf_type; to->si_perf_flags = from->si_perf_flags; break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_status = from->si_status; #ifdef CONFIG_X86_X32_ABI if (in_x32_syscall()) { to->si_utime = from->_sifields._sigchld_x32._utime; to->si_stime = from->_sifields._sigchld_x32._stime; } else #endif { to->si_utime = from->si_utime; to->si_stime = from->si_stime; } break; case SIL_RT: to->si_pid = from->si_pid; to->si_uid = from->si_uid; to->si_int = from->si_int; break; case SIL_SYS: to->si_call_addr = compat_ptr(from->si_call_addr); to->si_syscall = from->si_syscall; to->si_arch = from->si_arch; break; } return 0; } static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; from.si_signo = signo; return post_copy_siginfo_from_user32(to, &from); } int copy_siginfo_from_user32(struct kernel_siginfo *to, const struct compat_siginfo __user *ufrom) { struct compat_siginfo from; if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) return -EFAULT; return post_copy_siginfo_from_user32(to, &from); } #endif /* CONFIG_COMPAT */ /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for * @info: if non-null, the signal's siginfo is returned here * @ts: upper bound on process time suspension */ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, const struct timespec64 *ts) { ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; enum pid_type type; int sig, ret = 0; if (ts) { if (!timespec64_valid(ts)) return -EINVAL; timeout = timespec64_to_ktime(*ts); to = &timeout; } /* * Invert the set of allowed signals to get those we want to block. */ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); sig = dequeue_signal(&mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested * while we are sleeping in so that we'll be awakened when * they arrive. Unblocking is always fine, we can avoid * set_current_blocked(). */ tsk->real_blocked = tsk->blocked; sigandsets(&tsk->blocked, &tsk->blocked, &mask); recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); sig = dequeue_signal(&mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); if (sig) return sig; return ret ? -EINTR : -EAGAIN; } /** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese * @uthese: queued signals to wait for * @uinfo: if non-null, the signal's siginfo is returned here * @uts: upper bound on process time suspension * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct __kernel_timespec __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_timespec64(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct old_timespec32 __user *, uts, size_t, sigsetsize) { sigset_t these; struct timespec64 ts; kernel_siginfo_t info; int ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; if (uts) { if (get_old_timespec32(&ts, uts)) return -EFAULT; } ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_timespec64(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { sigset_t s; struct timespec64 t; kernel_siginfo_t info; long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&s, uthese)) return -EFAULT; if (uts) { if (get_old_timespec32(&t, uts)) return -EFAULT; } ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); if (ret > 0 && uinfo) { if (copy_siginfo_to_user32(uinfo, &info)) ret = -EFAULT; } return ret; } #endif #endif static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info, enum pid_type type) { clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER; info->si_pid = task_tgid_vnr(current); info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); } /** * sys_kill - send a signal to a process * @pid: the PID of the process * @sig: signal to be sent */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_TGID); return kill_something_info(sig, &info, pid); } /* * Verify that the signaler and signalee either are in the same pid namespace * or that the signaler's pid namespace is an ancestor of the signalee's pid * namespace. */ static bool access_pidfd_pidns(struct pid *pid) { struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *p = ns_of_pid(pid); for (;;) { if (!p) return false; if (p == active) break; p = p->parent; } return true; } static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* * Avoid hooking up compat syscalls and instead handle necessary * conversions here. Note, this is a stop-gap measure and should not be * considered a generic solution. */ if (in_compat_syscall()) return copy_siginfo_from_user32( kinfo, (struct compat_siginfo __user *)info); #endif return copy_siginfo_from_user(kinfo, info); } static struct pid *pidfd_to_pid(const struct file *file) { struct pid *pid; pid = pidfd_pid(file); if (!IS_ERR(pid)) return pid; return tgid_pidfd_to_pid(file); } #define PIDFD_SEND_SIGNAL_FLAGS \ (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ PIDFD_SIGNAL_PROCESS_GROUP) static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type, siginfo_t __user *info, unsigned int flags) { kernel_siginfo_t kinfo; switch (flags) { case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; break; case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; break; case PIDFD_SIGNAL_PROCESS_GROUP: type = PIDTYPE_PGID; break; } if (info) { int ret; ret = copy_siginfo_from_user_any(&kinfo, info); if (unlikely(ret)) return ret; if (unlikely(sig != kinfo.si_signo)) return -EINVAL; /* Only allow sending arbitrary signals to yourself. */ if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) return -EPERM; } else { prepare_kill_siginfo(sig, &kinfo, type); } if (type == PIDTYPE_PGID) return kill_pgrp_info(sig, &kinfo, pid); return kill_pid_info_type(sig, &kinfo, pid, type); } /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process * @sig: signal to send * @info: signal info * @flags: future flags * * Send the signal to the thread group or to the individual thread depending * on PIDFD_THREAD. * In the future extension to @flags may be used to override the default scope * of @pidfd. * * Return: 0 on success, negative errno on failure */ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, siginfo_t __user *, info, unsigned int, flags) { struct pid *pid; enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) return -EINVAL; /* Ensure that only a single signal scope determining flag is set. */ if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; switch (pidfd) { case PIDFD_SELF_THREAD: pid = get_task_pid(current, PIDTYPE_PID); type = PIDTYPE_PID; break; case PIDFD_SELF_THREAD_GROUP: pid = get_task_pid(current, PIDTYPE_TGID); type = PIDTYPE_TGID; break; default: { CLASS(fd, f)(pidfd); if (fd_empty(f)) return -EBADF; /* Is this a pidfd? */ pid = pidfd_to_pid(fd_file(f)); if (IS_ERR(pid)) return PTR_ERR(pid); if (!access_pidfd_pidns(pid)) return -EINVAL; /* Infer scope from the type of pidfd. */ if (fd_file(f)->f_flags & PIDFD_THREAD) type = PIDTYPE_PID; else type = PIDTYPE_TGID; return do_pidfd_send_signal(pid, sig, type, info, flags); } } return do_pidfd_send_signal(pid, sig, type, info, flags); } static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { struct task_struct *p; int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. */ if (!error && sig) { error = do_send_sig_info(sig, info, p, PIDTYPE_PID); /* * If lock_task_sighand() failed we pretend the task * dies after receiving the signal. The window is tiny, * and the signal is private anyway. */ if (unlikely(error == -ESRCH)) error = 0; } } rcu_read_unlock(); return error; } static int do_tkill(pid_t tgid, pid_t pid, int sig) { struct kernel_siginfo info; prepare_kill_siginfo(sig, &info, PIDTYPE_PID); return do_send_specific(tgid, pid, sig, &info); } /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread * @pid: the PID of the thread * @sig: signal to be sent * * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; return do_tkill(tgid, pid, sig); } /** * sys_tkill - send signal to one specific task * @pid: the PID of the task * @sig: signal to be sent * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) { /* This is only valid for single tasks */ if (pid <= 0) return -EINVAL; return do_tkill(0, pid, sig); } static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info) { /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; /* POSIX.1b doesn't mention process groups. */ return kill_proc_info(sig, info, pid); } /** * sys_rt_sigqueueinfo - send signal information to a signal * @pid: the PID of the thread * @sig: signal to be sent * @uinfo: signal info to be sent */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_sigqueueinfo(pid, sig, &info); } #endif static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info) { /* This is only valid for single tasks */ if (pid <= 0 || tgid <= 0) return -EINVAL; /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ if ((info->si_code >= 0 || info->si_code == SI_TKILL) && (task_pid_vnr(current) != pid)) return -EPERM; return do_send_specific(tgid, pid, sig, info); } SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, compat_pid_t, tgid, compat_pid_t, pid, int, sig, struct compat_siginfo __user *, uinfo) { kernel_siginfo_t info; int ret = __copy_siginfo_from_user32(sig, &info, uinfo); if (unlikely(ret)) return ret; return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); } #endif /* * For kthreads only, must not be used if cloned with CLONE_SIGHAND */ void kernel_sigaction(int sig, __sighandler_t action) { spin_lock_irq(&current->sighand->siglock); current->sighand->action[sig - 1].sa.sa_handler = action; if (action == SIG_IGN) { sigset_t mask; sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(current, &mask, &current->signal->shared_pending); flush_sigqueue_mask(current, &mask, &current->pending); recalc_sigpending(); } spin_unlock_irq(&current->sighand->siglock); } EXPORT_SYMBOL(kernel_sigaction); void __weak sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { } int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *p = current, *t; struct k_sigaction *k; sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; k = &p->sighand->action[sig-1]; spin_lock_irq(&p->sighand->siglock); if (k->sa.sa_flags & SA_IMMUTABLE) { spin_unlock_irq(&p->sighand->siglock); return -EINVAL; } if (oact) *oact = *k; /* * Make sure that we never accidentally claim to support SA_UNSUPPORTED, * e.g. by having an architecture use the bit in their uapi. */ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); /* * Clear unknown flag bits in order to allow userspace to detect missing * support for flag bits and to allow the kernel to use non-uapi bits * internally. */ if (act) act->sa.sa_flags &= UAPI_SA_FLAGS; if (oact) oact->sa.sa_flags &= UAPI_SA_FLAGS; sigaction_compat_abi(act, oact); if (act) { bool was_ignored = k->sa.sa_handler == SIG_IGN; sigdelsetmask(&act->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); *k = *act; /* * POSIX 3.3.1.3: * "Setting a signal action to SIG_IGN for a signal that is * pending shall cause the pending signal to be discarded, * whether or not it is blocked." * * "Setting a signal action to SIG_DFL for a signal that is * pending and whose default action is to ignore the signal * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ if (sig_handler_ignored(sig_handler(p, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); flush_sigqueue_mask(p, &mask, &p->signal->shared_pending); for_each_thread(p, t) flush_sigqueue_mask(p, &mask, &t->pending); } else if (was_ignored) { posixtimer_sig_unignore(p, sig); } } spin_unlock_irq(&p->sighand->siglock); return 0; } #ifdef CONFIG_DYNAMIC_SIGFRAME static inline void sigaltstack_lock(void) __acquires(&current->sighand->siglock) { spin_lock_irq(&current->sighand->siglock); } static inline void sigaltstack_unlock(void) __releases(&current->sighand->siglock) { spin_unlock_irq(&current->sighand->siglock); } #else static inline void sigaltstack_lock(void) { } static inline void sigaltstack_unlock(void) { } #endif static int do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, size_t min_ss_size) { struct task_struct *t = current; int ret = 0; if (oss) { memset(oss, 0, sizeof(stack_t)); oss->ss_sp = (void __user *) t->sas_ss_sp; oss->ss_size = t->sas_ss_size; oss->ss_flags = sas_ss_flags(sp) | (current->sas_ss_flags & SS_FLAG_BITS); } if (ss) { void __user *ss_sp = ss->ss_sp; size_t ss_size = ss->ss_size; unsigned ss_flags = ss->ss_flags; int ss_mode; if (unlikely(on_sig_stack(sp))) return -EPERM; ss_mode = ss_flags & ~SS_FLAG_BITS; if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK && ss_mode != 0)) return -EINVAL; /* * Return before taking any locks if no actual * sigaltstack changes were requested. */ if (t->sas_ss_sp == (unsigned long)ss_sp && t->sas_ss_size == ss_size && t->sas_ss_flags == ss_flags) return 0; sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; ss_sp = NULL; } else { if (unlikely(ss_size < min_ss_size)) ret = -ENOMEM; if (!sigaltstack_size_valid(ss_size)) ret = -ENOMEM; } if (!ret) { t->sas_ss_sp = (unsigned long) ss_sp; t->sas_ss_size = ss_size; t->sas_ss_flags = ss_flags; } sigaltstack_unlock(); } return ret; } SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss) { stack_t new, old; int err; if (uss && copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL, current_user_stack_pointer(), MINSIGSTKSZ); if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t))) err = -EFAULT; return err; } int restore_altstack(const stack_t __user *uss) { stack_t new; if (copy_from_user(&new, uss, sizeof(stack_t))) return -EFAULT; (void)do_sigaltstack(&new, NULL, current_user_stack_pointer(), MINSIGSTKSZ); /* squash all but EFAULT for now */ return 0; } int __save_altstack(stack_t __user *uss, unsigned long sp) { struct task_struct *t = current; int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #ifdef CONFIG_COMPAT static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr) { stack_t uss, uoss; int ret; if (uss_ptr) { compat_stack_t uss32; if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t))) return -EFAULT; uss.ss_sp = compat_ptr(uss32.ss_sp); uss.ss_flags = uss32.ss_flags; uss.ss_size = uss32.ss_size; } ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, compat_user_stack_pointer(), COMPAT_MINSIGSTKSZ); if (ret >= 0 && uoss_ptr) { compat_stack_t old; memset(&old, 0, sizeof(old)); old.ss_sp = ptr_to_compat(uoss.ss_sp); old.ss_flags = uoss.ss_flags; old.ss_size = uoss.ss_size; if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t))) ret = -EFAULT; } return ret; } COMPAT_SYSCALL_DEFINE2(sigaltstack, const compat_stack_t __user *, uss_ptr, compat_stack_t __user *, uoss_ptr) { return do_compat_sigaltstack(uss_ptr, uoss_ptr); } int compat_restore_altstack(const compat_stack_t __user *uss) { int err = do_compat_sigaltstack(uss, NULL); /* squash all but -EFAULT for now */ return err == -EFAULT ? err : 0; } int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) { int err; struct task_struct *t = current; err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) | __put_user(t->sas_ss_flags, &uss->ss_flags) | __put_user(t->sas_ss_size, &uss->ss_size); return err; } #endif #ifdef __ARCH_WANT_SYS_SIGPENDING /** * sys_sigpending - examine pending signals * @uset: where mask of pending signal is returned */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; do_sigpending(&set); if (copy_to_user(uset, &set, sizeof(old_sigset_t))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; do_sigpending(&set); return put_user(set.sig[0], set32); } #endif #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { old_sigset_t old_set, new_set; sigset_t new_blocked; old_set = current->blocked.sig[0]; if (nset) { if (copy_from_user(&new_set, nset, sizeof(*nset))) return -EFAULT; new_blocked = current->blocked; switch (how) { case SIG_BLOCK: sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: new_blocked.sig[0] = new_set; break; default: return -EINVAL; } set_current_blocked(&new_blocked); } if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) return -EFAULT; } return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifndef CONFIG_ODD_RT_SIGACTION /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent * @act: new sigaction * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); if (ret) return ret; if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct compat_sigaction __user *, act, struct compat_sigaction __user *, oact, compat_size_t, sigsetsize) { struct k_sigaction new_ka, old_ka; #ifdef __ARCH_HAS_SA_RESTORER compat_uptr_t restorer; #endif int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (act) { compat_uptr_t handler; ret = get_user(handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(handler); #ifdef __ARCH_HAS_SA_RESTORER ret |= get_user(restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask); ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask, sizeof(oact->sa_mask)); ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); #endif } return ret; } #endif #endif /* !CONFIG_ODD_RT_SIGACTION */ #ifdef CONFIG_OLD_SIGACTION SYSCALL_DEFINE3(sigaction, int, sig, const struct old_sigaction __user *, act, struct old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; if (act) { old_sigset_t mask; if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_COMPAT_OLD_SIGACTION COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, const struct compat_old_sigaction __user *, act, struct compat_old_sigaction __user *, oact) { struct k_sigaction new_ka, old_ka; int ret; compat_old_sigset_t mask; compat_uptr_t handler, restorer; if (act) { if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || __get_user(mask, &act->sa_mask)) return -EFAULT; #ifdef __ARCH_HAS_KA_RESTORER new_ka.ka_restorer = NULL; #endif new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); } ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; } return ret; } #endif #ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. */ SYSCALL_DEFINE0(sgetmask) { /* SMP safe */ return current->blocked.sig[0]; } SYSCALL_DEFINE1(ssetmask, int, newmask) { int old = current->blocked.sig[0]; sigset_t newset; siginitset(&newset, newmask); set_current_blocked(&newset); return old; } #endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* * For backwards compatibility. Functionality superseded by sigaction. */ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) { struct k_sigaction new_sa, old_sa; int ret; new_sa.sa.sa_handler = handler; new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; sigemptyset(&new_sa.sa.sa_mask); ret = do_sigaction(sig, &new_sa, &old_sa); return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* __ARCH_WANT_SYS_SIGNAL */ #ifdef __ARCH_WANT_SYS_PAUSE SYSCALL_DEFINE0(pause) { while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } return -ERESTARTNOHAND; } #endif static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); while (!signal_pending(current)) { __set_current_state(TASK_INTERRUPTIBLE); schedule(); } set_restore_sigmask(); return -ERESTARTNOHAND; } /** * sys_rt_sigsuspend - replace the signal mask for a value with the * @unewset value until a signal is received * @unewset: new signal mask value * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (copy_from_user(&newset, unewset, sizeof(newset))) return -EFAULT; return sigsuspend(&newset); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize) { sigset_t newset; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) return -EINVAL; if (get_compat_sigset(&newset, unewset)) return -EFAULT; return sigsuspend(&newset); } #endif #ifdef CONFIG_OLD_SIGSUSPEND SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif #ifdef CONFIG_OLD_SIGSUSPEND3 SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) { sigset_t blocked; siginitset(&blocked, mask); return sigsuspend(&blocked); } #endif __weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } static inline void siginfo_buildtime_checks(void) { BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); /* Verify the offsets in the two siginfos match */ #define CHECK_OFFSET(field) \ BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field)) /* kill */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); /* timer */ CHECK_OFFSET(si_tid); CHECK_OFFSET(si_overrun); CHECK_OFFSET(si_value); /* rt */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_value); /* sigchld */ CHECK_OFFSET(si_pid); CHECK_OFFSET(si_uid); CHECK_OFFSET(si_status); CHECK_OFFSET(si_utime); CHECK_OFFSET(si_stime); /* sigfault */ CHECK_OFFSET(si_addr); CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); CHECK_OFFSET(si_perf_data); CHECK_OFFSET(si_perf_type); CHECK_OFFSET(si_perf_flags); /* sigpoll */ CHECK_OFFSET(si_band); CHECK_OFFSET(si_fd); /* sigsys */ CHECK_OFFSET(si_call_addr); CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET /* usb asyncio */ BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != offsetof(struct siginfo, si_addr)); if (sizeof(int) == sizeof(void __user *)) { BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != sizeof(void __user *)); } else { BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + sizeof_field(struct siginfo, si_uid)) != sizeof(void __user *)); BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != offsetof(struct siginfo, si_uid)); } #ifdef CONFIG_COMPAT BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != offsetof(struct compat_siginfo, si_addr)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof(compat_uptr_t)); BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != sizeof_field(struct siginfo, si_pid)); #endif } #if defined(CONFIG_SYSCTL) static const struct ctl_table signal_debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { .procname = "exception-trace", .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, #endif }; static const struct ctl_table signal_table[] = { { .procname = "print-fatal-signals", .data = &print_fatal_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static int __init init_signal_sysctls(void) { register_sysctl_init("debug", signal_debug_table); register_sysctl_init("kernel", signal_table); return 0; } early_initcall(init_signal_sysctls); #endif /* CONFIG_SYSCTL */ void __init signals_init(void) { siginfo_buildtime_checks(); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT); } #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } new_t = kdb_prev_t != t; kdb_prev_t = t; if (!task_is_running(t) && new_t) { spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. " "The signal has _not_ been sent.\n" "Reissue the kill command if you want to risk " "the deadlock.\n"); return; } ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID); spin_unlock(&t->sighand->siglock); if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); } #endif /* CONFIG_KGDB_KDB */
218 260 39 39 131 260 30 218 132 260 260 258 218 218 132 132 216 146 123 218 218 30 218 215 216 133 18 132 132 132 132 131 218 218 30 30 30 133 133 18 18 297 231 297 23 299 298 598 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ void pfnmap_track_ctx_release(struct kref *ref); static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif
4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 // SPDX-License-Identifier: GPL-2.0-or-later /* * lib/plist.c * * Descending-priority-sorted double-linked list * * (C) 2002-2003 Intel Corp * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>. * * 2001-2005 (c) MontaVista Software, Inc. * Daniel Walker <dwalker@mvista.com> * * (C) 2005 Thomas Gleixner <tglx@linutronix.de> * * Simplifications of the original code by * Oleg Nesterov <oleg@tv-sign.ru> * * Based on simple lists (include/linux/list.h). * * This file contains the add / del functions which are considered to * be too large to inline. See include/linux/plist.h for further * information. */ #include <linux/bug.h> #include <linux/plist.h> #ifdef CONFIG_DEBUG_PLIST static struct plist_head test_head; static void plist_check_prev_next(struct list_head *t, struct list_head *p, struct list_head *n) { WARN(n->prev != p || p->next != n, "top: %p, n: %p, p: %p\n" "prev: %p, n: %p, p: %p\n" "next: %p, n: %p, p: %p\n", t, t->next, t->prev, p, p->next, p->prev, n, n->next, n->prev); } static void plist_check_list(struct list_head *top) { struct list_head *prev = top, *next = top->next; plist_check_prev_next(top, prev, next); while (next != top) { WRITE_ONCE(prev, next); WRITE_ONCE(next, prev->next); plist_check_prev_next(top, prev, next); } } static void plist_check_head(struct plist_head *head) { if (!plist_head_empty(head)) plist_check_list(&plist_first(head)->prio_list); plist_check_list(&head->node_list); } #else # define plist_check_head(h) do { } while (0) #endif /** * plist_add - add @node to @head * * @node: &struct plist_node pointer * @head: &struct plist_head pointer */ void plist_add(struct plist_node *node, struct plist_head *head) { struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter; struct list_head *node_next = &head->node_list; plist_check_head(head); WARN_ON(!plist_node_empty(node)); WARN_ON(!list_empty(&node->prio_list)); if (plist_head_empty(head)) goto ins_node; first = iter = plist_first(head); last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list); do { if (node->prio < iter->prio) { node_next = &iter->node_list; break; } else if (node->prio >= reverse_iter->prio) { prev = reverse_iter; iter = list_entry(reverse_iter->prio_list.next, struct plist_node, prio_list); if (likely(reverse_iter != last)) node_next = &iter->node_list; break; } prev = iter; iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); reverse_iter = list_entry(reverse_iter->prio_list.prev, struct plist_node, prio_list); } while (iter != first); if (!prev || prev->prio != node->prio) list_add_tail(&node->prio_list, &iter->prio_list); ins_node: list_add_tail(&node->node_list, node_next); plist_check_head(head); } /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ void plist_del(struct plist_node *node, struct plist_head *head) { plist_check_head(head); if (!list_empty(&node->prio_list)) { if (node->node_list.next != &head->node_list) { struct plist_node *next; next = list_entry(node->node_list.next, struct plist_node, node_list); /* add the next plist_node into prio_list */ if (list_empty(&next->prio_list)) list_add(&next->prio_list, &node->prio_list); } list_del_init(&node->prio_list); } list_del_init(&node->node_list); plist_check_head(head); } /** * plist_requeue - Requeue @node at end of same-prio entries. * * This is essentially an optimized plist_del() followed by * plist_add(). It moves an entry already in the plist to * after any other same-priority entries. * * @node: &struct plist_node pointer - entry to be moved * @head: &struct plist_head pointer - list head */ void plist_requeue(struct plist_node *node, struct plist_head *head) { struct plist_node *iter; struct list_head *node_next = &head->node_list; plist_check_head(head); BUG_ON(plist_head_empty(head)); BUG_ON(plist_node_empty(node)); if (node == plist_last(head)) return; iter = plist_next(node); if (node->prio != iter->prio) return; plist_del(node, head); /* * After plist_del(), iter is the replacement of the node. If the node * was on prio_list, take shortcut to find node_next instead of looping. */ if (!list_empty(&iter->prio_list)) { iter = list_entry(iter->prio_list.next, struct plist_node, prio_list); node_next = &iter->node_list; goto queue; } plist_for_each_continue(iter, head) { if (node->prio != iter->prio) { node_next = &iter->node_list; break; } } queue: list_add_tail(&node->node_list, node_next); plist_check_head(head); } #ifdef CONFIG_DEBUG_PLIST #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/module.h> #include <linux/init.h> static struct plist_node __initdata test_node[241]; static void __init plist_test_check(int nr_expect) { struct plist_node *first, *prio_pos, *node_pos; if (plist_head_empty(&test_head)) { BUG_ON(nr_expect != 0); return; } prio_pos = first = plist_first(&test_head); plist_for_each(node_pos, &test_head) { if (nr_expect-- < 0) break; if (node_pos == first) continue; if (node_pos->prio == prio_pos->prio) { BUG_ON(!list_empty(&node_pos->prio_list)); continue; } BUG_ON(prio_pos->prio > node_pos->prio); BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list); prio_pos = node_pos; } BUG_ON(nr_expect != 0); BUG_ON(prio_pos->prio_list.next != &first->prio_list); } static void __init plist_test_requeue(struct plist_node *node) { plist_requeue(node, &test_head); if (node != plist_last(&test_head)) BUG_ON(node->prio == plist_next(node)->prio); } static int __init plist_test(void) { int nr_expect = 0, i, loop; unsigned int r = local_clock(); printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); for (loop = 0; loop < 1000; loop++) { r = r * 193939 % 47629; i = r % ARRAY_SIZE(test_node); if (plist_node_empty(test_node + i)) { r = r * 193939 % 47629; test_node[i].prio = r % 99; plist_add(test_node + i, &test_head); nr_expect++; } else { plist_del(test_node + i, &test_head); nr_expect--; } plist_test_check(nr_expect); if (!plist_node_empty(test_node + i)) { plist_test_requeue(test_node + i); plist_test_check(nr_expect); } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) continue; plist_del(test_node + i, &test_head); nr_expect--; plist_test_check(nr_expect); } printk(KERN_DEBUG "end plist test\n"); /* Worst case test for plist_add() */ unsigned int test_data[241]; for (i = 0; i < ARRAY_SIZE(test_data); i++) test_data[i] = i; ktime_t start, end, time_elapsed = 0; plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) { plist_node_init(test_node + i, 0); test_node[i].prio = test_data[i]; } for (i = 0; i < ARRAY_SIZE(test_node); i++) { if (plist_node_empty(test_node + i)) { start = ktime_get(); plist_add(test_node + i, &test_head); end = ktime_get(); time_elapsed += (end - start); } } pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed); return 0; } module_init(plist_test); #endif
288 288 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CONTEXT_TRACKING_STATE_H #define _LINUX_CONTEXT_TRACKING_STATE_H #include <linux/percpu.h> #include <linux/static_key.h> #include <linux/context_tracking_irq.h> /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define CT_NESTING_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ CT_STATE_KERNEL = 0, CT_STATE_IDLE = 1, CT_STATE_USER = 2, CT_STATE_GUEST = 3, CT_STATE_MAX = 4, }; /* Odd value for watching, else even. */ #define CT_RCU_WATCHING CT_STATE_MAX #define CT_STATE_MASK (CT_STATE_MAX - 1) #define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER /* * When active is false, probes are unset in order * to minimize overhead: TIF flags are cleared * and calls to user_enter/exit are ignored. This * may be further optimized using static keys. */ bool active; int recursion; #endif #ifdef CONFIG_CONTEXT_TRACKING atomic_t state; #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE long nesting; /* Track process nesting level. */ long nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; #ifdef CONFIG_CONTEXT_TRACKING DECLARE_PER_CPU(struct context_tracking, context_tracking); #endif #ifdef CONFIG_CONTEXT_TRACKING_USER static __always_inline int __ct_state(void) { return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_STATE_MASK; } #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE static __always_inline int ct_rcu_watching(void) { return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_rcu_watching_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline long ct_nesting(void) { return __this_cpu_read(context_tracking.nesting); } static __always_inline long ct_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return ct->nesting; } static __always_inline long ct_nmi_nesting(void) { return __this_cpu_read(context_tracking.nmi_nesting); } static __always_inline long ct_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); return ct->nmi_nesting; } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ #ifdef CONFIG_CONTEXT_TRACKING_USER extern struct static_key_false context_tracking_key; static __always_inline bool context_tracking_enabled(void) { return static_branch_unlikely(&context_tracking_key); } static __always_inline bool context_tracking_enabled_cpu(int cpu) { return context_tracking_enabled() && per_cpu(context_tracking.active, cpu); } static __always_inline bool context_tracking_enabled_this_cpu(void) { return context_tracking_enabled() && __this_cpu_read(context_tracking.active); } /** * ct_state() - return the current context tracking state if known * * Returns the current cpu's context tracking state if context tracking * is enabled. If context tracking is disabled, returns * CT_STATE_DISABLED. This should be used primarily for debugging. */ static __always_inline int ct_state(void) { int ret; if (!context_tracking_enabled()) return CT_STATE_DISABLED; preempt_disable(); ret = __ct_state(); preempt_enable(); return ret; } #else static __always_inline bool context_tracking_enabled(void) { return false; } static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; } static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; } #endif /* CONFIG_CONTEXT_TRACKING_USER */ #endif
6 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_SUPPORTED_FLAGS \ (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. */ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. */ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. */ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. */ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. */ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. */ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = NLA_POLICY_MAX(NLA_U32, TC_QOPT_MAX_QUEUE), [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. */ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); return -EINVAL; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt) { if (!dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } return 0; } /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. */ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. */ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); rcu_read_lock(); oper = rcu_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rcu_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); rcu_read_unlock(); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device *dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct kernel_ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. */ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) { err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; u32 taprio_flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0); /* txtime-assist and full offload are mutually exclusive */ if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); return -EINVAL; } if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } q->flags = taprio_flags; /* Needed for length_to_duration() during netlink attribute parsing */ taprio_set_picos_per_byte(dev, q); err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } /* Not going to race against advance_sched(), but still */ admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); admin = rcu_replace_pointer(q->admin_sched, new_admin, lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. */ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; rcu_read_lock(); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error_rcu; if (oper && dump_schedule(skb, oper)) goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error_rcu: rcu_read_unlock(); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, .select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc");
280 280 278 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/mm/mmap.c * * Copyright (C) 2012 ARM Ltd. */ #include <linux/io.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/types.h> #include <asm/cpufeature.h> #include <asm/page.h> static pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = PAGE_NONE, [VM_READ] = PAGE_READONLY, [VM_WRITE] = PAGE_READONLY, [VM_WRITE | VM_READ] = PAGE_READONLY, /* PAGE_EXECONLY if Enhanced PAN */ [VM_EXEC] = PAGE_READONLY_EXEC, [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, [VM_EXEC | VM_WRITE] = PAGE_READONLY_EXEC, [VM_EXEC | VM_WRITE | VM_READ] = PAGE_READONLY_EXEC, [VM_SHARED] = PAGE_NONE, [VM_SHARED | VM_READ] = PAGE_READONLY, [VM_SHARED | VM_WRITE] = PAGE_SHARED, [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, /* PAGE_EXECONLY if Enhanced PAN */ [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. */ int valid_phys_addr_range(phys_addr_t addr, size_t size) { /* * Check whether addr is covered by a memory region without the * MEMBLOCK_NOMAP attribute, and whether that region covers the * entire range. In theory, this could lead to false negatives * if the range is covered by distinct but adjacent memory regions * that only differ in other attributes. However, few of such * attributes have been defined, and it is debatable whether it * follows that /dev/mem read() calls should be able traverse * such boundaries. */ return memblock_is_region_memory(addr, size) && memblock_is_map_memory(addr); } /* * Do not allow /dev/mem mappings beyond the supported physical range. */ int valid_mmap_phys_addr_range(unsigned long pfn, size_t size) { return !(((pfn << PAGE_SHIFT) + size) & ~PHYS_MASK); } static int __init adjust_protection_map(void) { /* * With Enhanced PAN we can honour the execute-only permissions as * there is no PAN override with such mappings. */ if (cpus_have_cap(ARM64_HAS_EPAN)) { protection_map[VM_EXEC] = PAGE_EXECONLY; protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } if (lpa2_is_enabled()) for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; return 0; } arch_initcall(adjust_protection_map); pgprot_t vm_get_page_prot(unsigned long vm_flags) { ptdesc_t prot; /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { prot = _PAGE_GCS_RO; } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); } if (vm_flags & VM_ARM64_BTI) prot |= PTE_GP; /* * There are two conditions required for returning a Normal Tagged * memory type: (1) the user requested it via PROT_MTE passed to * mmap() or mprotect() and (2) the corresponding vma supports MTE. We * register (1) as VM_MTE in the vma->vm_flags and (2) as * VM_MTE_ALLOWED. Note that the latter can only be set during the * mmap() call since mprotect() does not accept MAP_* flags. * Checking for VM_MTE only is sufficient since arch_validate_flags() * does not permit (VM_MTE & !VM_MTE_ALLOWED). */ if (vm_flags & VM_MTE) prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED); #ifdef CONFIG_ARCH_HAS_PKEYS if (system_supports_poe()) { if (vm_flags & VM_PKEY_BIT0) prot |= PTE_PO_IDX_0; if (vm_flags & VM_PKEY_BIT1) prot |= PTE_PO_IDX_1; if (vm_flags & VM_PKEY_BIT2) prot |= PTE_PO_IDX_2; } #endif return __pgprot(prot); } EXPORT_SYMBOL(vm_get_page_prot);
161 2 3 158 3 1 1 1 70 70 69 1 8 1 49 2 2 1 1 3 49 52 3 78 1 67 53 2 1 49 1 49 1 3 5 2 53 5 10 41 28 26 8 1 60 59 35 25 1 57 23 13 12 1 6 38 20 110 37 4 1 1 1 78 33 24 2 7 18 13 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0-only /* * VGIC: KVM DEVICE API * * Copyright (C) 2015 ARM Ltd. * Author: Marc Zyngier <marc.zyngier@arm.com> */ #include <linux/irqchip/arm-gic-v3.h> #include <linux/kvm_host.h> #include <kvm/arm_vgic.h> #include <linux/uaccess.h> #include <asm/kvm_mmu.h> #include <asm/cputype.h> #include "vgic.h" /* common helpers */ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr, phys_addr_t addr, phys_addr_t alignment, phys_addr_t size) { if (!IS_VGIC_ADDR_UNDEF(ioaddr)) return -EEXIST; if (!IS_ALIGNED(addr, alignment) || !IS_ALIGNED(size, alignment)) return -EINVAL; if (addr + size < addr) return -EINVAL; if (addr & ~kvm_phys_mask(&kvm->arch.mmu) || (addr + size) > kvm_phys_size(&kvm->arch.mmu)) return -E2BIG; return 0; } static int vgic_check_type(struct kvm *kvm, int type_needed) { if (kvm->arch.vgic.vgic_model != type_needed) return -ENODEV; else return 0; } int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { struct vgic_dist *vgic = &kvm->arch.vgic; int r; mutex_lock(&kvm->arch.config_lock); switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); if (!r) r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, SZ_4K, KVM_VGIC_V2_DIST_SIZE); if (!r) vgic->vgic_dist_base = dev_addr->addr; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); if (!r) r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, SZ_4K, KVM_VGIC_V2_CPU_SIZE); if (!r) vgic->vgic_cpu_base = dev_addr->addr; break; default: r = -ENODEV; } mutex_unlock(&kvm->arch.config_lock); return r; } /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct * @attr: pointer to the attribute being retrieved/updated * @write: if true set the address in the VM address space, if false read the * address * * Set or get the vgic base addresses for the distributor and the virtual CPU * interface in the VM physical address space. These addresses are properties * of the emulated core/SoC and therefore user space initially knows this * information. * Check them for sanity (alignment, double assignment). We can't check for * overlapping regions in case of a virtual GICv3 here, since we don't know * the number of VCPUs yet, so we defer this check to map_resources(). */ static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) { u64 __user *uaddr = (u64 __user *)attr->addr; struct vgic_dist *vgic = &kvm->arch.vgic; phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; u64 addr; int r; /* Reading a redistributor region addr implies getting the index */ if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) if (get_user(addr, uaddr)) return -EFAULT; /* * Since we can't hold config_lock while registering the redistributor * iodevs, take the slots_lock immediately. */ mutex_lock(&kvm->slots_lock); switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_4K; size = KVM_VGIC_V2_DIST_SIZE; break; case KVM_VGIC_V2_ADDR_TYPE_CPU: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_cpu_base; alignment = SZ_4K; size = KVM_VGIC_V2_CPU_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); addr_ptr = &vgic->vgic_dist_base; alignment = SZ_64K; size = KVM_VGIC_V3_DIST_SIZE; break; case KVM_VGIC_V3_ADDR_TYPE_REDIST: { struct vgic_redist_region *rdreg; r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); if (r) break; if (write) { r = vgic_v3_set_redist_base(kvm, 0, addr, 0); goto out; } rdreg = list_first_entry_or_null(&vgic->rd_regions, struct vgic_redist_region, list); if (!rdreg) addr_ptr = &undef_value; else addr_ptr = &rdreg->base; break; } case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: { struct vgic_redist_region *rdreg; u8 index; r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3); if (r) break; index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; if (write) { gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); if (!count || flags) r = -EINVAL; else r = vgic_v3_set_redist_base(kvm, index, base, count); goto out; } rdreg = vgic_v3_rdist_region_from_index(kvm, index); if (!rdreg) { r = -ENOENT; goto out; } addr = index; addr |= rdreg->base; addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; goto out; } default: r = -ENODEV; } if (r) goto out; mutex_lock(&kvm->arch.config_lock); if (write) { r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) *addr_ptr = addr; } else { addr = *addr_ptr; } mutex_unlock(&kvm->arch.config_lock); out: mutex_unlock(&kvm->slots_lock); if (!r && !write) r = put_user(addr, uaddr); return r; } static int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: r = kvm_vgic_addr(dev->kvm, attr, true); return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 val; int ret = 0; if (get_user(val, uaddr)) return -EFAULT; /* * We require: * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs * - at most 1024 interrupts * - a multiple of 32 interrupts */ if (val < (VGIC_NR_PRIVATE_IRQS + 32) || val > VGIC_MAX_RESERVED || (val & 31)) return -EINVAL; mutex_lock(&dev->kvm->arch.config_lock); /* * Either userspace has already configured NR_IRQS or * the vgic has already been initialized and vgic_init() * supplied a default amount of SPIs. */ if (dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; else dev->kvm->arch.vgic.nr_spis = val - VGIC_NR_PRIVATE_IRQS; mutex_unlock(&dev->kvm->arch.config_lock); return ret; } case KVM_DEV_ARM_VGIC_GRP_CTRL: { switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: mutex_lock(&dev->kvm->arch.config_lock); r = vgic_init(dev->kvm); mutex_unlock(&dev->kvm->arch.config_lock); return r; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: /* * OK, this one isn't common at all, but we * want to handle all control group attributes * in a single place. */ if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) return -ENXIO; mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); r = vgic_v3_save_pending_tables(dev->kvm); mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; } break; } } return -ENXIO; } static int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = -ENXIO; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: r = kvm_vgic_addr(dev->kvm, attr, false); return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; r = put_user(dev->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS, uaddr); break; } } return r; } static int vgic_create(struct kvm_device *dev, u32 type) { return kvm_vgic_create(dev->kvm, type); } static void vgic_destroy(struct kvm_device *dev) { kfree(dev); } int kvm_register_vgic_device(unsigned long type) { int ret = -ENODEV; switch (type) { case KVM_DEV_TYPE_ARM_VGIC_V2: ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2); break; case KVM_DEV_TYPE_ARM_VGIC_V3: ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3); if (ret) break; ret = kvm_vgic_register_its_device(); break; } return ret; } int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr); reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; reg_attr->vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid); if (!reg_attr->vcpu) return -EINVAL; return 0; } /** * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state * * @dev: kvm device handle * @attr: kvm device attribute * @is_write: true if userspace is writing a register */ static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; int ret; u32 val; ret = vgic_v2_parse_attr(dev, attr, &reg_attr); if (ret) return ret; vcpu = reg_attr.vcpu; addr = reg_attr.addr; if (is_write) if (get_user(val, uaddr)) return -EFAULT; mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); ret = vgic_init(dev->kvm); if (ret) goto out; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); break; default: ret = -EINVAL; break; } out: mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && !is_write) ret = put_user(val, uaddr); return ret; } static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, true); default: return vgic_set_common_attr(dev, attr); } } static int vgic_v2_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, false); default: return vgic_get_common_attr(dev, attr); } } static int vgic_v2_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: case KVM_VGIC_V2_ADDR_TYPE_CPU: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: return 0; case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; } } return -ENXIO; } struct kvm_device_ops kvm_arm_vgic_v2_ops = { .name = "kvm-arm-vgic-v2", .create = vgic_create, .destroy = vgic_destroy, .set_attr = vgic_v2_set_attr, .get_attr = vgic_v2_get_attr, .has_attr = vgic_v2_has_attr, }; int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, struct vgic_reg_attr *reg_attr) { unsigned long vgic_mpidr, mpidr_reg; /* * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group, * attr might not hold MPIDR. Hence assume vcpu0. */ if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) { vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >> KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT; mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr); reg_attr->vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg); } else { reg_attr->vcpu = kvm_get_vcpu(dev->kvm, 0); } if (!reg_attr->vcpu) return -EINVAL; reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK; return 0; } /* * Allow access to certain ID-like registers prior to VGIC initialization, * thereby allowing the VMM to provision the features / sizing of the VGIC. */ static bool reg_allowed_pre_init(struct kvm_device_attr *attr) { if (attr->group != KVM_DEV_ARM_VGIC_GRP_DIST_REGS) return false; switch (attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK) { case GICD_IIDR: case GICD_TYPER2: return true; default: return false; } } /* * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state * * @dev: kvm device handle * @attr: kvm device attribute * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; bool uaccess; u32 val; int ret; ret = vgic_v3_parse_attr(dev, attr, &reg_attr); if (ret) return ret; vcpu = reg_attr.vcpu; addr = reg_attr.addr; switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: /* Sysregs uaccess is performed by the sysreg handling code */ uaccess = false; break; default: uaccess = true; } if (uaccess && is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; if (get_user(val, uaddr)) return -EFAULT; } mutex_lock(&dev->kvm->lock); if (kvm_trylock_all_vcpus(dev->kvm)) { mutex_unlock(&dev->kvm->lock); return -EBUSY; } mutex_lock(&dev->kvm->arch.config_lock); if (!(vgic_initialized(dev->kvm) || reg_allowed_pre_init(attr))) { ret = -EBUSY; goto out; } switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); break; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { unsigned int info, intid; info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, intid, &val); } else { ret = -EINVAL; } break; } default: ret = -EINVAL; break; } out: mutex_unlock(&dev->kvm->arch.config_lock); kvm_unlock_all_vcpus(dev->kvm); mutex_unlock(&dev->kvm->lock); if (!ret && uaccess && !is_write) { u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; ret = put_user(val, uaddr); } return ret; } static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { u32 __user *uaddr = (u32 __user *)attr->addr; u32 val; if (get_user(val, uaddr)) return -EFAULT; guard(mutex)(&dev->kvm->arch.config_lock); if (vgic_initialized(dev->kvm)) return -EBUSY; if (!irq_is_ppi(val)) return -EINVAL; dev->kvm->arch.vgic.mi_intid = val; return 0; } default: return vgic_set_common_attr(dev, attr); } } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, false); case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; guard(mutex)(&dev->kvm->arch.config_lock); return put_user(dev->kvm->arch.vgic.mi_intid, uaddr); } default: return vgic_get_common_attr(dev, attr); } } static int vgic_v3_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_ADDR: switch (attr->attr) { case KVM_VGIC_V3_ADDR_TYPE_DIST: case KVM_VGIC_V3_ADDR_TYPE_REDIST: case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION: return 0; } break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: return vgic_v3_has_attr_regs(dev, attr); case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: return 0; case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) == VGIC_LEVEL_INFO_LINE_LEVEL) return 0; break; } case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: return 0; } } return -ENXIO; } struct kvm_device_ops kvm_arm_vgic_v3_ops = { .name = "kvm-arm-vgic-v3", .create = vgic_create, .destroy = vgic_destroy, .set_attr = vgic_v3_set_attr, .get_attr = vgic_v3_get_attr, .has_attr = vgic_v3_has_attr, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2001 Momchil Velikov * Portions Copyright (C) 2001 Christoph Hellwig * Copyright (C) 2006 Nick Piggin * Copyright (C) 2012 Konstantin Khlebnikov */ #ifndef _LINUX_RADIX_TREE_H #define _LINUX_RADIX_TREE_H #include <linux/bitops.h> #include <linux/gfp_types.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/math.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/xarray.h> #include <linux/local_lock.h> /* Keep unconverted code working */ #define radix_tree_root xarray #define radix_tree_node xa_node struct radix_tree_preload { local_lock_t lock; unsigned nr; /* nodes->parent points to next preallocated node */ struct radix_tree_node *nodes; }; DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads); /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: * * 00 - data pointer * 10 - internal entry * x1 - value entry * * The internal entry may be a pointer to the next level in the tree, a * sibling entry, or an indicator that the entry in this slot has been moved * to another location in the tree and the lookup should be restarted. While * NULL fits the 'data pointer' pattern, it means that there is no entry in * the tree for this index (no matter what level of the tree it is found at). * This means that storing a NULL entry in the tree is the same as deleting * the entry from the tree. */ #define RADIX_TREE_ENTRY_MASK 3UL #define RADIX_TREE_INTERNAL_NODE 2UL static inline bool radix_tree_is_internal_node(void *ptr) { return ((unsigned long)ptr & RADIX_TREE_ENTRY_MASK) == RADIX_TREE_INTERNAL_NODE; } /*** radix-tree API starts here ***/ #define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) #define RADIX_TREE_MAX_TAGS XA_MAX_MARKS #define RADIX_TREE_TAG_LONGS XA_MARK_LONGS #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) /* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) /* The top bits of xa_flags are used to store the root tags */ #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) #define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask) #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(name, mask) #define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask) static inline bool radix_tree_empty(const struct radix_tree_root *root) { return root->xa_head == NULL; } /** * struct radix_tree_iter - radix tree iterator state * * @index: index of current slot * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating * @node: node that contains current slot * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is * described by a pointer to its first slot and a struct radix_tree_iter * which holds the chunk's position in the tree and its size. For tagged * iteration radix_tree_iter also holds the slots' bit-mask for one chosen * radix tree tag. */ struct radix_tree_iter { unsigned long index; unsigned long next_index; unsigned long tags; struct radix_tree_node *node; }; /** * Radix-tree synchronization * * The radix-tree API requires that users provide all synchronisation (with * specific exceptions, noted below). * * Synchronization of access to the data items being stored in the tree, and * management of their lifetimes must be completely managed by API users. * * For API usage, in general, * - any function _modifying_ the tree or tags (inserting or deleting * items, setting or clearing tags) must exclude other modifications, and * exclude any functions reading the tree. * - any function _reading_ the tree or tags (looking up items or tags, * gang lookups) must exclude modifications to the tree, but may occur * concurrently with other readers. * * The notable exceptions to this rule are the following functions: * __radix_tree_lookup * radix_tree_lookup * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup * radix_tree_gang_lookup_tag * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. * * It is still required that the caller manage the synchronization and lifetimes * of the items. So if RCU lock-free lookups are used, typically this would mean * that the items have their own locks, or are amenable to lock-free access; and * that the items are freed by RCU (or only freed after having been deleted from * the radix tree *and* a synchronize_rcu() grace period). * * (Note, rcu_assign_pointer and rcu_dereference are not needed to control * access to data items when inserting into or looking up from the radix tree) * * Note that the value returned by radix_tree_tag_get() may not be relied upon * if only the RCU read lock is held. Functions to set/clear tags and to * delete nodes running concurrently with it may affect its result such that * two consecutive reads in the same locked section may return different * values. If reliability is required, modification functions must also be * excluded from concurrency. * * radix_tree_tagged is able to be called without locking or RCU. */ /** * radix_tree_deref_slot - dereference a slot * @slot: slot pointer, returned by radix_tree_lookup_slot * * For use with radix_tree_lookup_slot(). Caller must hold tree at least read * locked across slot lookup and dereference. Not required if write lock is * held (ie. items cannot be concurrently inserted). * * radix_tree_deref_retry must be used to confirm validity of the pointer if * only the read lock is held. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot(void __rcu **slot) { return rcu_dereference(*slot); } /** * radix_tree_deref_slot_protected - dereference a slot with tree lock held * @slot: slot pointer, returned by radix_tree_lookup_slot * * Similar to radix_tree_deref_slot. The caller does not hold the RCU read * lock but it must hold the tree lock to prevent parallel updates. * * Return: entry stored in that slot. */ static inline void *radix_tree_deref_slot_protected(void __rcu **slot, spinlock_t *treelock) { return rcu_dereference_protected(*slot, lockdep_is_held(treelock)); } /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot * Returns: 0 if retry is not required, otherwise retry is required * * radix_tree_deref_retry must be used with radix_tree_deref_slot. */ static inline int radix_tree_deref_retry(void *arg) { return unlikely(radix_tree_is_internal_node(arg)); } /** * radix_tree_exception - radix_tree_deref_slot returned either exception? * @arg: value returned by radix_tree_deref_slot * Returns: 0 if well-aligned pointer, non-0 if either kind of exception. */ static inline int radix_tree_exception(void *arg) { return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } int radix_tree_insert(struct radix_tree_root *, unsigned long index, void *); void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp); void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *, unsigned long index, unsigned int tag); void *radix_tree_tag_clear(struct radix_tree_root *, unsigned long index, unsigned int tag); int radix_tree_tag_get(const struct radix_tree_root *, unsigned long index, unsigned int tag); void radix_tree_iter_tag_clear(struct radix_tree_root *, const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items, unsigned int tag); unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, void __rcu ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); static inline void radix_tree_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max); enum { RADIX_TREE_ITER_TAG_MASK = 0x0f, /* tag index in lower nybble */ RADIX_TREE_ITER_TAGGED = 0x10, /* lookup tagged slots */ RADIX_TREE_ITER_CONTIG = 0x20, /* stop at first hole */ }; /** * radix_tree_iter_init - initialize radix tree iterator * * @iter: pointer to iterator state * @start: iteration starting index * Returns: NULL */ static __always_inline void __rcu ** radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start) { /* * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it * in the case of a successful tagged chunk lookup. If the lookup was * unsuccessful or non-tagged then nobody cares about ->tags. * * Set index to zero to bypass next_index overflow protection. * See the comment in radix_tree_next_chunk() for details. */ iter->index = 0; iter->next_index = start; return NULL; } /** * radix_tree_next_chunk - find next chunk of slots for iteration * * @root: radix tree root * @iter: iterator state * @flags: RADIX_TREE_ITER_* flags and tag index * Returns: pointer to chunk first slot, or NULL if there no more left * * This function looks up the next chunk in the radix tree starting from * @iter->next_index. It returns a pointer to the chunk's first slot. * Also it fills @iter with data about chunk: position in the tree (index), * its end (next_index), and constructs a bit mask for tagged iterating (tags). */ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *, struct radix_tree_iter *iter, unsigned flags); /** * radix_tree_iter_lookup - look up an index in the radix tree * @root: radix tree root * @iter: iterator state * @index: key to look up * * If @index is present in the radix tree, this function returns the slot * containing it and updates @iter to describe the entry. If @index is not * present, it returns NULL. */ static inline void __rcu ** radix_tree_iter_lookup(const struct radix_tree_root *root, struct radix_tree_iter *iter, unsigned long index) { radix_tree_iter_init(iter, index); return radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG); } /** * radix_tree_iter_retry - retry this chunk of the iteration * @iter: iterator state * * If we iterate over a tree protected only by the RCU lock, a race * against deletion or creation may result in seeing a slot for which * radix_tree_deref_retry() returns true. If so, call this function * and continue the iteration. */ static inline __must_check void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter) { iter->next_index = iter->index; iter->tags = 0; return NULL; } static inline unsigned long __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) { return iter->index + slots; } /** * radix_tree_iter_resume - resume iterating when the chunk may be invalid * @slot: pointer to current slot * @iter: iterator state * Returns: New slot pointer * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function * before releasing the lock to continue the iteration from the next index. */ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size * * @iter: pointer to radix tree iterator * Returns: current chunk size */ static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { return iter->next_index - iter->index; } /** * radix_tree_next_slot - find next slot in chunk * * @slot: pointer to current slot * @iter: pointer to iterator state * @flags: RADIX_TREE_ITER_*, should be constant * Returns: pointer to next slot, or NULL if there no more left * * This function updates @iter->index in the case of a successful lookup. * For tagged lookup it also eats @iter->tags. * * There are several cases where 'slot' can be passed in as NULL to this * function. These cases result from the use of radix_tree_iter_resume() or * radix_tree_iter_retry(). In these cases we don't end up dereferencing * 'slot' because either: * a) we are doing tagged iteration and iter->tags has been set to 0, or * b) we are doing non-tagged iteration, and iter->index and iter->next_index * have been set up so that radix_tree_chunk_size() returns 1 or 0. */ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); slot++; goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); iter->tags >>= offset++; iter->index = __radix_tree_iter_add(iter, offset); slot += offset; goto found; } } else { long count = radix_tree_chunk_size(iter); while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); if (likely(*slot)) goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; break; } } } return NULL; found: return slot; } /** * radix_tree_for_each_slot - iterate over non-empty slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_slot(slot, root, iter, start) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ slot = radix_tree_next_slot(slot, iter, 0)) /** * radix_tree_for_each_tagged - iterate over tagged slots * * @slot: the void** variable for pointer to slot * @root: the struct radix_tree_root pointer * @iter: the struct radix_tree_iter pointer * @start: iteration starting index * @tag: tag index * * @slot points to radix tree slot, @iter->index contains its index. */ #define radix_tree_for_each_tagged(slot, root, iter, start, tag) \ for (slot = radix_tree_iter_init(iter, start) ; \ slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */
349 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 /* * include/linux/topology.h * * Written by: Matthew Dobson, IBM Corporation * * Copyright (C) 2002, IBM Corp. * * All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Send feedback to <colpatch@us.ibm.com> */ #ifndef _LINUX_TOPOLOGY_H #define _LINUX_TOPOLOGY_H #include <linux/arch_topology.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/bitops.h> #include <linux/mmzone.h> #include <linux/smp.h> #include <linux/percpu.h> #include <asm/topology.h> #ifndef nr_cpus_node #define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node)) #endif int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 #define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif #ifndef RECLAIM_DISTANCE /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) * and node_reclaim_mode is enabled then the VM will only call node_reclaim() * on nodes within this distance. */ #define RECLAIM_DISTANCE 30 #endif /* * The following tunable allows platforms to override the default node * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are * sufficiently fast that the default value actually hurts * performance. * * AMD EPYC machines use this because even though the 2-hop distance * is 32 (3.2x slower than a local memory access) performance actually * *improves* if allowed to reclaim memory and load balance tasks * between NUMA nodes 2-hops apart. */ extern int __read_mostly node_reclaim_distance; #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DECLARE_PER_CPU(int, numa_node); #ifndef numa_node_id /* Returns the number of the current Node. */ static inline int numa_node_id(void) { return raw_cpu_read(numa_node); } #endif #ifndef cpu_to_node static inline int cpu_to_node(int cpu) { return per_cpu(numa_node, cpu); } #endif #ifndef set_numa_node static inline void set_numa_node(int node) { this_cpu_write(numa_node, node); } #endif #ifndef set_cpu_numa_node static inline void set_cpu_numa_node(int cpu, int node) { per_cpu(numa_node, cpu) = node; } #endif #else /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */ /* Returns the number of the current Node. */ #ifndef numa_node_id static inline int numa_node_id(void) { return cpu_to_node(raw_smp_processor_id()); } #endif #endif /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */ #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); } #endif #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return raw_cpu_read(_numa_mem_); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return per_cpu(_numa_mem_, cpu); } #endif #ifndef set_cpu_numa_mem static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; } #endif #else /* !CONFIG_HAVE_MEMORYLESS_NODES */ #ifndef numa_mem_id /* Returns the number of the nearest Node with memory */ static inline int numa_mem_id(void) { return numa_node_id(); } #endif #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { return cpu_to_node(cpu); } #endif #endif /* [!]CONFIG_HAVE_MEMORYLESS_NODES */ #if defined(topology_die_id) && defined(topology_die_cpumask) #define TOPOLOGY_DIE_SYSFS #endif #if defined(topology_cluster_id) && defined(topology_cluster_cpumask) #define TOPOLOGY_CLUSTER_SYSFS #endif #if defined(topology_book_id) && defined(topology_book_cpumask) #define TOPOLOGY_BOOK_SYSFS #endif #if defined(topology_drawer_id) && defined(topology_drawer_cpumask) #define TOPOLOGY_DRAWER_SYSFS #endif #ifndef topology_physical_package_id #define topology_physical_package_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_die_id #define topology_die_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_cluster_id #define topology_cluster_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_core_id #define topology_core_id(cpu) ((void)(cpu), 0) #endif #ifndef topology_book_id #define topology_book_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_drawer_id #define topology_drawer_id(cpu) ((void)(cpu), -1) #endif #ifndef topology_ppin #define topology_ppin(cpu) ((void)(cpu), 0ull) #endif #ifndef topology_sibling_cpumask #define topology_sibling_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_core_cpumask #define topology_core_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_cluster_cpumask #define topology_cluster_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_die_cpumask #define topology_die_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_book_cpumask #define topology_book_cpumask(cpu) cpumask_of(cpu) #endif #ifndef topology_drawer_cpumask #define topology_drawer_cpumask(cpu) cpumask_of(cpu) #endif #if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask) static inline const struct cpumask *cpu_smt_mask(int cpu) { return topology_sibling_cpumask(cpu); } #endif #ifndef topology_is_primary_thread static inline bool topology_is_primary_thread(unsigned int cpu) { /* * When disabling SMT, the primary thread of the SMT will remain * enabled/active. Architectures that have a special primary thread * (e.g. x86) need to override this function. Otherwise the first * thread in the SMT can be made the primary thread. * * The sibling cpumask of an offline CPU always contains the CPU * itself on architectures using the implementation of * CONFIG_GENERIC_ARCH_TOPOLOGY for building their topology. * Other architectures not using CONFIG_GENERIC_ARCH_TOPOLOGY for * building their topology have to check whether to use this default * implementation or to override it. */ return cpu == cpumask_first(topology_sibling_cpumask(cpu)); } #define topology_is_primary_thread topology_is_primary_thread #endif static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } #ifdef CONFIG_NUMA int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node); extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops); #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * sched_numa_hop_mask(unsigned int node, unsigned int hops) { return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NUMA */ /** * for_each_node_numadist() - iterate over nodes in increasing distance * order, starting from a given node * @node: the iteration variable and the starting node. * @unvisited: a nodemask to keep track of the unvisited nodes. * * This macro iterates over NUMA node IDs in increasing distance from the * starting @node and yields MAX_NUMNODES when all the nodes have been * visited. * * Note that by the time the loop completes, the @unvisited nodemask will * be fully cleared, unless the loop exits early. * * The difference between for_each_node() and for_each_node_numadist() is * that the former allows to iterate over nodes in numerical order, whereas * the latter iterates over nodes in increasing order of distance. * * This complexity of this iterator is O(N^2), where N represents the * number of nodes, as each iteration involves scanning all nodes to * find the one with the shortest distance. * * Requires rcu_lock to be held. */ #define for_each_node_numadist(node, unvisited) \ for (int __start = (node), \ (node) = nearest_node_nodemask((__start), &(unvisited)); \ (node) < MAX_NUMNODES; \ node_clear((node), (unvisited)), \ (node) = nearest_node_nodemask((__start), &(unvisited))) /** * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance * from a given node. * @mask: the iteration variable. * @node: the NUMA node to start the search from. * * Requires rcu_lock to be held. * * Yields cpu_online_mask for @node == NUMA_NO_NODE. */ #define for_each_numa_hop_mask(mask, node) \ for (unsigned int __hops = 0; \ mask = (node != NUMA_NO_NODE || __hops) ? \ sched_numa_hop_mask(node, __hops) : \ cpu_online_mask, \ !IS_ERR_OR_NULL(mask); \ __hops++) DECLARE_PER_CPU(unsigned long, cpu_scale); static inline unsigned long topology_get_cpu_scale(int cpu) { return per_cpu(cpu_scale, cpu); } void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); #endif /* _LINUX_TOPOLOGY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_COMPAT_H #define __ASM_COMPAT_H #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; #define __compat_uid_t __compat_uid_t typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; #define compat_ipc_pid_t compat_ipc_pid_t typedef u16 compat_ipc_pid_t; #define compat_statfs compat_statfs #include <asm-generic/compat.h> #ifdef CONFIG_COMPAT /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #ifdef __AARCH64EB__ #define COMPAT_UTS_MACHINE "armv8b\0\0" #else #define COMPAT_UTS_MACHINE "armv8l\0\0" #endif typedef u16 __compat_uid16_t; typedef u16 __compat_gid16_t; typedef s32 compat_nlink_t; struct compat_stat { #ifdef __AARCH64EB__ short st_dev; short __pad1; #else compat_dev_t st_dev; #endif compat_ino_t st_ino; compat_mode_t st_mode; compat_ushort_t st_nlink; __compat_uid16_t st_uid; __compat_gid16_t st_gid; #ifdef __AARCH64EB__ short st_rdev; short __pad2; #else compat_dev_t st_rdev; #endif compat_off_t st_size; compat_off_t st_blksize; compat_off_t st_blocks; old_time32_t st_atime; compat_ulong_t st_atime_nsec; old_time32_t st_mtime; compat_ulong_t st_mtime_nsec; old_time32_t st_ctime; compat_ulong_t st_ctime_nsec; compat_ulong_t __unused4[2]; }; struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; int f_flags; int f_spare[4]; }; #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 static inline int is_compat_task(void) { return test_thread_flag(TIF_32BIT); } static inline int is_compat_thread(struct thread_info *thread) { return test_ti_thread_flag(thread, TIF_32BIT); } long compat_arm_syscall(struct pt_regs *regs, int scno); #else /* !CONFIG_COMPAT */ static inline int is_compat_thread(struct thread_info *thread) { return 0; } #endif /* CONFIG_COMPAT */ #endif /* __ASM_COMPAT_H */
166 166 3 165 166 154 3 3 3 3 154 3 192 3 154 192 193 192 193 193 52 165 166 3 3 3 53 52 53 4 4 4 53 53 52 4 53 53 4 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the SID table type. * * Original author: Stephen Smalley, <stephen.smalley.work@gmail.com> * Author: Ondrej Mosnacek, <omosnacek@gmail.com> * * Copyright (C) 2018 Red Hat, Inc. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <asm/barrier.h> #include "flask.h" #include "security.h" #include "sidtab.h" #include "services.h" struct sidtab_str_cache { struct rcu_head rcu_member; struct list_head lru_member; struct sidtab_entry *parent; u32 len; char str[] __counted_by(len); }; #define index_to_sid(index) ((index) + SECINITSID_NUM + 1) #define sid_to_index(sid) ((sid) - (SECINITSID_NUM + 1)) int sidtab_init(struct sidtab *s) { u32 i; memset(s->roots, 0, sizeof(s->roots)); for (i = 0; i < SECINITSID_NUM; i++) s->isids[i].set = 0; s->frozen = false; s->count = 0; s->convert = NULL; hash_init(s->context_to_sid); spin_lock_init(&s->lock); #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 s->cache_free_slots = CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE; INIT_LIST_HEAD(&s->cache_lru_list); spin_lock_init(&s->cache_lock); #endif return 0; } static u32 context_to_sid(struct sidtab *s, struct context *context, u32 hash) { struct sidtab_entry *entry; u32 sid = 0; rcu_read_lock(); hash_for_each_possible_rcu(s->context_to_sid, entry, list, hash) { if (entry->hash != hash) continue; if (context_equal(&entry->context, context)) { sid = entry->sid; break; } } rcu_read_unlock(); return sid; } int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context) { struct sidtab_isid_entry *isid; u32 hash; int rc; if (sid == 0 || sid > SECINITSID_NUM) return -EINVAL; isid = &s->isids[sid - 1]; rc = context_cpy(&isid->entry.context, context); if (rc) return rc; #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 isid->entry.cache = NULL; #endif isid->set = 1; hash = context_compute_hash(context); /* * Multiple initial sids may map to the same context. Check that this * context is not already represented in the context_to_sid hashtable * to avoid duplicate entries and long linked lists upon hash * collision. */ if (!context_to_sid(s, context, hash)) { isid->entry.sid = sid; isid->entry.hash = hash; hash_add(s->context_to_sid, &isid->entry.list, hash); } return 0; } int sidtab_hash_stats(struct sidtab *sidtab, char *page) { unsigned int i; int chain_len = 0; int slots_used = 0; int entries = 0; int max_chain_len = 0; unsigned int cur_bucket = 0; struct sidtab_entry *entry; rcu_read_lock(); hash_for_each_rcu(sidtab->context_to_sid, i, entry, list) { entries++; if (i == cur_bucket) { chain_len++; if (chain_len == 1) slots_used++; } else { cur_bucket = i; if (chain_len > max_chain_len) max_chain_len = chain_len; chain_len = 0; } } rcu_read_unlock(); if (chain_len > max_chain_len) max_chain_len = chain_len; return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n" "longest chain: %d\n", entries, slots_used, SIDTAB_HASH_BUCKETS, max_chain_len); } static u32 sidtab_level_from_count(u32 count) { u32 capacity = SIDTAB_LEAF_ENTRIES; u32 level = 0; while (count > capacity) { capacity <<= SIDTAB_INNER_SHIFT; ++level; } return level; } static int sidtab_alloc_roots(struct sidtab *s, u32 level) { u32 l; if (!s->roots[0].ptr_leaf) { s->roots[0].ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!s->roots[0].ptr_leaf) return -ENOMEM; } for (l = 1; l <= level; ++l) if (!s->roots[l].ptr_inner) { s->roots[l].ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!s->roots[l].ptr_inner) return -ENOMEM; s->roots[l].ptr_inner->entries[0] = s->roots[l - 1]; } return 0; } static struct sidtab_entry *sidtab_do_lookup(struct sidtab *s, u32 index, int alloc) { union sidtab_entry_inner *entry; u32 level, capacity_shift, leaf_index = index / SIDTAB_LEAF_ENTRIES; /* find the level of the subtree we need */ level = sidtab_level_from_count(index + 1); capacity_shift = level * SIDTAB_INNER_SHIFT; /* allocate roots if needed */ if (alloc && sidtab_alloc_roots(s, level) != 0) return NULL; /* lookup inside the subtree */ entry = &s->roots[level]; while (level != 0) { capacity_shift -= SIDTAB_INNER_SHIFT; --level; entry = &entry->ptr_inner->entries[leaf_index >> capacity_shift]; leaf_index &= ((u32)1 << capacity_shift) - 1; if (!entry->ptr_inner) { if (alloc) entry->ptr_inner = kzalloc( SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!entry->ptr_inner) return NULL; } } if (!entry->ptr_leaf) { if (alloc) entry->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC); if (!entry->ptr_leaf) return NULL; } return &entry->ptr_leaf->entries[index % SIDTAB_LEAF_ENTRIES]; } static struct sidtab_entry *sidtab_lookup(struct sidtab *s, u32 index) { /* read entries only after reading count */ u32 count = smp_load_acquire(&s->count); if (index >= count) return NULL; return sidtab_do_lookup(s, index, 0); } static struct sidtab_entry *sidtab_lookup_initial(struct sidtab *s, u32 sid) { return s->isids[sid - 1].set ? &s->isids[sid - 1].entry : NULL; } static struct sidtab_entry *sidtab_search_core(struct sidtab *s, u32 sid, int force) { if (sid != 0) { struct sidtab_entry *entry; if (sid > SECINITSID_NUM) entry = sidtab_lookup(s, sid_to_index(sid)); else entry = sidtab_lookup_initial(s, sid); if (entry && (!entry->context.len || force)) return entry; } return sidtab_lookup_initial(s, SECINITSID_UNLABELED); } struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid) { return sidtab_search_core(s, sid, 0); } struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid) { return sidtab_search_core(s, sid, 1); } int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid) { unsigned long flags; u32 count, hash = context_compute_hash(context); struct sidtab_convert_params *convert; struct sidtab_entry *dst, *dst_convert; int rc; *sid = context_to_sid(s, context, hash); if (*sid) return 0; /* lock-free search failed: lock, re-search, and insert if not found */ spin_lock_irqsave(&s->lock, flags); rc = 0; *sid = context_to_sid(s, context, hash); if (*sid) goto out_unlock; if (unlikely(s->frozen)) { /* * This sidtab is now frozen - tell the caller to abort and * get the new one. */ rc = -ESTALE; goto out_unlock; } count = s->count; /* bail out if we already reached max entries */ rc = -EOVERFLOW; if (count >= SIDTAB_MAX) goto out_unlock; /* insert context into new entry */ rc = -ENOMEM; dst = sidtab_do_lookup(s, count, 1); if (!dst) goto out_unlock; dst->sid = index_to_sid(count); dst->hash = hash; rc = context_cpy(&dst->context, context); if (rc) goto out_unlock; /* * if we are building a new sidtab, we need to convert the context * and insert it there as well */ convert = s->convert; if (convert) { struct sidtab *target = convert->target; rc = -ENOMEM; dst_convert = sidtab_do_lookup(target, count, 1); if (!dst_convert) { context_destroy(&dst->context); goto out_unlock; } rc = services_convert_context(convert->args, context, &dst_convert->context, GFP_ATOMIC); if (rc) { context_destroy(&dst->context); goto out_unlock; } dst_convert->sid = index_to_sid(count); dst_convert->hash = context_compute_hash(&dst_convert->context); target->count = count + 1; hash_add_rcu(target->context_to_sid, &dst_convert->list, dst_convert->hash); } if (context->len) pr_info("SELinux: Context %s is not valid (left unmapped).\n", context->str); *sid = index_to_sid(count); /* write entries before updating count */ smp_store_release(&s->count, count + 1); hash_add_rcu(s->context_to_sid, &dst->list, dst->hash); rc = 0; out_unlock: spin_unlock_irqrestore(&s->lock, flags); return rc; } static void sidtab_convert_hashtable(struct sidtab *s, u32 count) { struct sidtab_entry *entry; u32 i; for (i = 0; i < count; i++) { entry = sidtab_do_lookup(s, i, 0); entry->sid = index_to_sid(i); entry->hash = context_compute_hash(&entry->context); hash_add_rcu(s->context_to_sid, &entry->list, entry->hash); } } static int sidtab_convert_tree(union sidtab_entry_inner *edst, union sidtab_entry_inner *esrc, u32 *pos, u32 count, u32 level, struct sidtab_convert_params *convert) { int rc; u32 i; if (level != 0) { if (!edst->ptr_inner) { edst->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL); if (!edst->ptr_inner) return -ENOMEM; } i = 0; while (i < SIDTAB_INNER_ENTRIES && *pos < count) { rc = sidtab_convert_tree(&edst->ptr_inner->entries[i], &esrc->ptr_inner->entries[i], pos, count, level - 1, convert); if (rc) return rc; i++; } } else { if (!edst->ptr_leaf) { edst->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL); if (!edst->ptr_leaf) return -ENOMEM; } i = 0; while (i < SIDTAB_LEAF_ENTRIES && *pos < count) { rc = services_convert_context( convert->args, &esrc->ptr_leaf->entries[i].context, &edst->ptr_leaf->entries[i].context, GFP_KERNEL); if (rc) return rc; (*pos)++; i++; } cond_resched(); } return 0; } int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params) { unsigned long flags; u32 count, level, pos; int rc; spin_lock_irqsave(&s->lock, flags); /* concurrent policy loads are not allowed */ if (s->convert) { spin_unlock_irqrestore(&s->lock, flags); return -EBUSY; } count = s->count; level = sidtab_level_from_count(count); /* allocate last leaf in the new sidtab (to avoid race with * live convert) */ rc = sidtab_do_lookup(params->target, count - 1, 1) ? 0 : -ENOMEM; if (rc) { spin_unlock_irqrestore(&s->lock, flags); return rc; } /* set count in case no new entries are added during conversion */ params->target->count = count; /* enable live convert of new entries */ s->convert = params; /* we can safely convert the tree outside the lock */ spin_unlock_irqrestore(&s->lock, flags); pr_info("SELinux: Converting %u SID table entries...\n", count); /* convert all entries not covered by live convert */ pos = 0; rc = sidtab_convert_tree(&params->target->roots[level], &s->roots[level], &pos, count, level, params); if (rc) { /* we need to keep the old table - disable live convert */ spin_lock_irqsave(&s->lock, flags); s->convert = NULL; spin_unlock_irqrestore(&s->lock, flags); return rc; } /* * The hashtable can also be modified in sidtab_context_to_sid() * so we must re-acquire the lock here. */ spin_lock_irqsave(&s->lock, flags); sidtab_convert_hashtable(params->target, count); spin_unlock_irqrestore(&s->lock, flags); return 0; } void sidtab_cancel_convert(struct sidtab *s) { unsigned long flags; /* cancelling policy load - disable live convert of sidtab */ spin_lock_irqsave(&s->lock, flags); s->convert = NULL; spin_unlock_irqrestore(&s->lock, flags); } void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags) __acquires(&s->lock) { spin_lock_irqsave(&s->lock, *flags); s->frozen = true; s->convert = NULL; } void sidtab_freeze_end(struct sidtab *s, unsigned long *flags) __releases(&s->lock) { spin_unlock_irqrestore(&s->lock, *flags); } static void sidtab_destroy_entry(struct sidtab_entry *entry) { context_destroy(&entry->context); #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 kfree(rcu_dereference_raw(entry->cache)); #endif } static void sidtab_destroy_tree(union sidtab_entry_inner entry, u32 level) { u32 i; if (level != 0) { struct sidtab_node_inner *node = entry.ptr_inner; if (!node) return; for (i = 0; i < SIDTAB_INNER_ENTRIES; i++) sidtab_destroy_tree(node->entries[i], level - 1); kfree(node); } else { struct sidtab_node_leaf *node = entry.ptr_leaf; if (!node) return; for (i = 0; i < SIDTAB_LEAF_ENTRIES; i++) sidtab_destroy_entry(&node->entries[i]); kfree(node); } } void sidtab_destroy(struct sidtab *s) { u32 i, level; for (i = 0; i < SECINITSID_NUM; i++) if (s->isids[i].set) sidtab_destroy_entry(&s->isids[i].entry); level = SIDTAB_MAX_LEVEL; while (level && !s->roots[level].ptr_inner) --level; sidtab_destroy_tree(s->roots[level], level); /* * The context_to_sid hashtable's objects are all shared * with the isids array and context tree, and so don't need * to be cleaned up here. */ } #if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry, const char *str, u32 str_len) { struct sidtab_str_cache *cache, *victim = NULL; unsigned long flags; /* do not cache invalid contexts */ if (entry->context.len) return; spin_lock_irqsave(&s->cache_lock, flags); cache = rcu_dereference_protected(entry->cache, lockdep_is_held(&s->cache_lock)); if (cache) { /* entry in cache - just bump to the head of LRU list */ list_move(&cache->lru_member, &s->cache_lru_list); goto out_unlock; } cache = kmalloc(struct_size(cache, str, str_len), GFP_ATOMIC); if (!cache) goto out_unlock; if (s->cache_free_slots == 0) { /* pop a cache entry from the tail and free it */ victim = container_of(s->cache_lru_list.prev, struct sidtab_str_cache, lru_member); list_del(&victim->lru_member); rcu_assign_pointer(victim->parent->cache, NULL); } else { s->cache_free_slots--; } cache->parent = entry; cache->len = str_len; memcpy(cache->str, str, str_len); list_add(&cache->lru_member, &s->cache_lru_list); rcu_assign_pointer(entry->cache, cache); out_unlock: spin_unlock_irqrestore(&s->cache_lock, flags); kfree_rcu(victim, rcu_member); } int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry, char **out, u32 *out_len) { struct sidtab_str_cache *cache; int rc = 0; if (entry->context.len) return -ENOENT; /* do not cache invalid contexts */ rcu_read_lock(); cache = rcu_dereference(entry->cache); if (!cache) { rc = -ENOENT; } else { *out_len = cache->len; if (out) { *out = kmemdup(cache->str, cache->len, GFP_ATOMIC); if (!*out) rc = -ENOMEM; } } rcu_read_unlock(); if (!rc && out) sidtab_sid2str_put(s, entry, *out, *out_len); return rc; } #endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */
1 256 256 345 256 263 253 254 345 319 2 344 30 229 345 345 344 345 263 262 257 17 345 344 345 325 345 344 345 345 344 345 2 2 2 2 265 265 265 262 90 131 131 257 256 257 1 257 257 257 257 256 256 257 13 13 262 230 257 256 257 254 2 257 257 257 257 229 231 256 253 251 257 231 257 256 257 256 55 53 14 14 13 13 13 13 13 13 54 55 101 89 1 13 13 12 90 12 12 12 1 1 1 264 265 26 26 3 3 26 26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 // SPDX-License-Identifier: GPL-2.0-only /* * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. * No bombay mix was harmed in the writing of this file. * * Copyright (C) 2020 Google LLC * Author: Will Deacon <will@kernel.org> */ #include <linux/bitfield.h> #include <asm/kvm_pgtable.h> #include <asm/stage2_pgtable.h> struct kvm_pgtable_walk_data { struct kvm_pgtable_walker *walker; const u64 start; u64 addr; const u64 end; }; static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) { return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); } static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) { return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); } static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) { u64 granule = kvm_granule_size(ctx->level); if (!kvm_level_supports_block_mapping(ctx->level)) return false; if (granule > (ctx->end - ctx->addr)) return false; if (!IS_ALIGNED(phys, granule)) return false; return IS_ALIGNED(ctx->addr, granule); } static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) { u64 shift = kvm_granule_shift(level); u64 mask = BIT(PAGE_SHIFT - 3) - 1; return (data->addr >> shift) & mask; } static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) { u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ u64 mask = BIT(pgt->ia_bits) - 1; return (addr & mask) >> shift; } static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) { struct kvm_pgtable pgt = { .ia_bits = ia_bits, .start_level = start_level, }; return kvm_pgd_page_idx(&pgt, -1ULL) + 1; } static bool kvm_pte_table(kvm_pte_t pte, s8 level) { if (level == KVM_PGTABLE_LAST_LEVEL) return false; if (!kvm_pte_valid(pte)) return false; return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) { return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); } static void kvm_clear_pte(kvm_pte_t *ptep) { WRITE_ONCE(*ptep, 0); } static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) { kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); pte |= KVM_PTE_VALID; return pte; } static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) { kvm_pte_t pte = kvm_phys_to_pte(pa); u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : KVM_PTE_TYPE_BLOCK; pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); pte |= FIELD_PREP(KVM_PTE_TYPE, type); pte |= KVM_PTE_VALID; return pte; } static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) { return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); } static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_walker *walker = data->walker; /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); return walker->cb(ctx, visit); } static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, int r) { /* * Visitor callbacks return EAGAIN when the conditions that led to a * fault are no longer reflected in the page tables due to a race to * update a PTE. In the context of a fault handler this is interpreted * as a signal to retry guest execution. * * Ignore the return code altogether for walkers outside a fault handler * (e.g. write protecting a range of memory) and chug along with the * page table walk. */ if (r == -EAGAIN) return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); return !r; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pteref, s8 level) { enum kvm_pgtable_walk_flags flags = data->walker->flags; kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); struct kvm_pgtable_visit_ctx ctx = { .ptep = ptep, .old = READ_ONCE(*ptep), .arg = data->walker->arg, .mm_ops = mm_ops, .start = data->start, .addr = data->addr, .end = data->end, .level = level, .flags = flags, }; int ret = 0; bool reload = false; kvm_pteref_t childp; bool table = kvm_pte_table(ctx.old, level); if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); reload = true; } if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); reload = true; } /* * Reload the page table after invoking the walker callback for leaf * entries or after pre-order traversal, to allow the walker to descend * into a newly installed or replaced table. */ if (reload) { ctx.old = READ_ONCE(*ptep); table = kvm_pte_table(ctx.old, level); } if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (!table) { data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); data->addr += kvm_granule_size(level); goto out; } childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); if (!kvm_pgtable_walk_continue(data->walker, ret)) goto out; if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); out: if (kvm_pgtable_walk_continue(data->walker, ret)) return 0; return ret; } static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) { u32 idx; int ret = 0; if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || level > KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { kvm_pteref_t pteref = &pgtable[idx]; if (data->addr >= data->end) break; ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); if (ret) break; } return ret; } static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) { u32 idx; int ret = 0; u64 limit = BIT(pgt->ia_bits); if (data->addr > limit || data->end > limit) return -ERANGE; if (!pgt->pgd) return -EINVAL; for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); if (ret) break; } return ret; } int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker) { struct kvm_pgtable_walk_data walk_data = { .start = ALIGN_DOWN(addr, PAGE_SIZE), .addr = ALIGN_DOWN(addr, PAGE_SIZE), .end = PAGE_ALIGN(walk_data.addr + size), .walker = walker, }; int r; r = kvm_pgtable_walk_begin(walker); if (r) return r; r = _kvm_pgtable_walk(pgt, &walk_data); kvm_pgtable_walk_end(walker); return r; } struct leaf_walk_data { kvm_pte_t pte; s8 level; }; static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct leaf_walk_data *data = ctx->arg; data->pte = ctx->old; data->level = ctx->level; return 0; } int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, kvm_pte_t *ptep, s8 *level) { struct leaf_walk_data data; struct kvm_pgtable_walker walker = { .cb = leaf_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = &data, }; int ret; ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), PAGE_SIZE, &walker); if (!ret) { if (ptep) *ptep = data.pte; if (level) *level = data.level; } return ret; } struct hyp_map_data { const u64 phys; kvm_pte_t attr; }; static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { bool device = prot & KVM_PGTABLE_PROT_DEVICE; u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; if (!(prot & KVM_PGTABLE_PROT_R)) return -EINVAL; if (prot & KVM_PGTABLE_PROT_X) { if (prot & KVM_PGTABLE_PROT_W) return -EINVAL; if (device) return -EINVAL; if (system_supports_bti_kernel()) attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; } else { attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); if (!kvm_lpa2_is_enabled()) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) { enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; u32 ap; if (!kvm_pte_valid(pte)) return prot; if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) prot |= KVM_PGTABLE_PROT_X; ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) prot |= KVM_PGTABLE_PROT_R; else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) prot |= KVM_PGTABLE_PROT_RW; return prot; } static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct hyp_map_data *data) { u64 phys = data->phys + (ctx->addr - ctx->start); kvm_pte_t new; if (!kvm_block_mapping_supported(ctx, phys)) return false; new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); if (ctx->old == new) return true; if (!kvm_pte_valid(ctx->old)) ctx->mm_ops->get_page(ctx->ptep); else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) return false; smp_store_release(ctx->ptep, new); return true; } static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp, new; struct hyp_map_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (hyp_map_walker_try_leaf(ctx, data)) return 0; if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); if (!childp) return -ENOMEM; new = kvm_init_table_pte(childp, mm_ops); mm_ops->get_page(ctx->ptep); smp_store_release(ctx->ptep, new); return 0; } int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot) { int ret; struct hyp_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), }; struct kvm_pgtable_walker walker = { .cb = hyp_map_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; ret = hyp_set_prot_attr(prot, &map_data.attr); if (ret) return ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); isb(); return ret; } static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t *childp = NULL; u64 granule = kvm_granule_size(ctx->level); u64 *unmapped = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return -EINVAL; if (kvm_pte_table(ctx->old, ctx->level)) { childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; kvm_clear_pte(ctx->ptep); dsb(ishst); __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); } else { if (ctx->end - ctx->addr < granule) return -EINVAL; kvm_clear_pte(ctx->ptep); dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); *unmapped += granule; } dsb(ish); isb(); mm_ops->put_page(ctx->ptep); if (childp) mm_ops->put_page(childp); return 0; } u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { u64 unmapped = 0; struct kvm_pgtable_walker walker = { .cb = hyp_unmap_walker, .arg = &unmapped, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; if (!pgt->mm_ops->page_count) return 0; kvm_pgtable_walk(pgt, addr, size, &walker); return unmapped; } int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, struct kvm_pgtable_mm_ops *mm_ops) { s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - ARM64_HW_PGTABLE_LEVELS(va_bits); if (start_level < KVM_PGTABLE_FIRST_LEVEL || start_level > KVM_PGTABLE_LAST_LEVEL) return -EINVAL; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = va_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = NULL; pgt->force_pte_cb = NULL; return 0; } static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) { struct kvm_pgtable_walker walker = { .cb = hyp_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); pgt->pgd = NULL; } struct stage2_map_data { const u64 phys; kvm_pte_t attr; u8 owner_id; kvm_pte_t *anchor; kvm_pte_t *childp; struct kvm_s2_mmu *mmu; void *memcache; /* Force mappings to page granularity */ bool force_pte; /* Walk should update owner_id only */ bool annotation; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) { u64 vtcr = VTCR_EL2_FLAGS; s8 lvls; vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; vtcr |= VTCR_EL2_T0SZ(phys_shift); /* * Use a minimum 2 level page table to prevent splitting * host PMD huge pages at stage2. */ lvls = stage2_pgtable_levels(phys_shift); if (lvls < 2) lvls = 2; /* * When LPA2 is enabled, the HW supports an extra level of translation * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 * to as an addition to SL0 to enable encoding this extra start level. * However, since we always use concatenated pages for the first level * lookup, we will never need this extra level and therefore do not need * to touch SL2. */ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally * on all CPUs. In systems that have asymmetric support for the feature * this allows KVM to leverage hardware support on the subset of cores * that implement the feature. * * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by * hardware) on implementations that do not advertise support for the * feature. As such, setting HA unconditionally is safe, unless you * happen to be running on a design that has unadvertised support for * HAFDBS. Here be dragons. */ if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ if (kvm_lpa2_is_enabled()) vtcr |= VTCR_EL2_DS; /* Set the vmid bits */ vtcr |= (get_vmid_bits(mmfr1) == 16) ? VTCR_EL2_VS_16BIT : VTCR_EL2_VS_8BIT; return vtcr; } static bool stage2_has_fwb(struct kvm_pgtable *pgt) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return false; return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); } void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, size_t size) { unsigned long pages, inval_pages; if (!system_supports_tlb_range()) { kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); return; } pages = size >> PAGE_SHIFT; while (pages > 0) { inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); addr += inval_pages << PAGE_SHIFT; pages -= inval_pages; } } #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; switch (prot & (KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC)) { case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: return -EINVAL; case KVM_PGTABLE_PROT_DEVICE: if (prot & KVM_PGTABLE_PROT_X) return -EINVAL; attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); break; case KVM_PGTABLE_PROT_NORMAL_NC: if (prot & KVM_PGTABLE_PROT_X) return -EINVAL; attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); break; default: attr = KVM_S2_MEMATTR(pgt, NORMAL); } if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; if (prot & KVM_PGTABLE_PROT_W) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; if (!kvm_lpa2_is_enabled()) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) { enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; if (!kvm_pte_valid(pte)) return prot; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) prot |= KVM_PGTABLE_PROT_R; if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) prot |= KVM_PGTABLE_PROT_W; if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) prot |= KVM_PGTABLE_PROT_X; return prot; } static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) return true; return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); } static bool stage2_pte_is_counted(kvm_pte_t pte) { /* * The refcount tracks valid entries as well as invalid entries if they * encode ownership of a page to another entity than the page-table * owner, whose id is 0. */ return !!pte; } static bool stage2_pte_is_locked(kvm_pte_t pte) { return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); } static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { if (!kvm_pgtable_walk_shared(ctx)) { WRITE_ONCE(*ctx->ptep, new); return true; } return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; } /** * stage2_try_break_pte() - Invalidates a pte according to the * 'break-before-make' requirements of the * architecture. * * @ctx: context of the visited pte. * @mmu: stage-2 mmu * * Returns: true if the pte was successfully broken. * * If the removed pte was valid, performs the necessary serialization and TLB * invalidation for the old value. For counted ptes, drops the reference count * on the containing table page. */ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (stage2_pte_is_locked(ctx->old)) { /* * Should never occur if this walker has exclusive access to the * page tables. */ WARN_ON(!kvm_pgtable_walk_shared(ctx)); return false; } if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { /* * Perform the appropriate TLB invalidation based on the * evicted pte value (if any). */ if (kvm_pte_table(ctx->old, ctx->level)) { u64 size = kvm_granule_size(ctx->level); u64 addr = ALIGN_DOWN(ctx->addr, size); kvm_tlb_flush_vmid_range(mmu, addr, size); } else if (kvm_pte_valid(ctx->old)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); return true; } static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); if (stage2_pte_is_counted(new)) mm_ops->get_page(ctx->ptep); smp_store_release(ctx->ptep, new); } static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) { /* * If FEAT_TLBIRANGE is implemented, defer the individual * TLB invalidations until the entire walk is finished, and * then use the range-based TLBI instructions to do the * invalidations. Condition deferred TLB invalidation on the * system supporting FWB as the optimization is entirely * pointless when the unmap walker needs to perform CMOs. */ return system_supports_tlb_range() && stage2_has_fwb(pgt); } static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops) { struct kvm_pgtable *pgt = ctx->arg; /* * Clear the existing PTE, and perform break-before-make if it was * valid. Depending on the system support, defer the TLB maintenance * for the same until the entire unmap walk is completed. */ if (kvm_pte_valid(ctx->old)) { kvm_clear_pte(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, TLBI_TTL_UNKNOWN); } else if (!stage2_unmap_defer_tlb_flush(pgt)) { kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); } } mm_ops->put_page(ctx->ptep); } static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) { u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; return kvm_pte_valid(pte) && memattr == KVM_S2_MEMATTR(pgt, NORMAL); } static bool stage2_pte_executable(kvm_pte_t pte) { return kvm_pte_valid(pte) && !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, const struct stage2_map_data *data) { u64 phys = data->phys; /* Work out the correct PA based on how far the walk has gotten */ return phys + (ctx->addr - ctx->start); } static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { u64 phys = stage2_map_walker_phys_addr(ctx, data); if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) return false; if (data->annotation) return true; return kvm_block_mapping_supported(ctx, phys); } static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { kvm_pte_t new; u64 phys = stage2_map_walker_phys_addr(ctx, data); u64 granule = kvm_granule_size(ctx->level); struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_leaf_mapping_allowed(ctx, data)) return -E2BIG; if (!data->annotation) new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); else new = kvm_init_invalid_leaf_owner(data->owner_id); /* * Skip updating the PTE if we are trying to recreate the exact * same mapping or only change the access permissions. Instead, * the vCPU will exit one more time from guest if still needed * and then go through the path of relaxing permissions. */ if (!stage2_pte_needs_update(ctx->old, new)) return -EAGAIN; /* If we're only changing software bits, then store them and go! */ if (!kvm_pgtable_walk_shared(ctx) && !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) { bool old_is_counted = stage2_pte_is_counted(ctx->old); if (old_is_counted != stage2_pte_is_counted(new)) { if (old_is_counted) mm_ops->put_page(ctx->ptep); else mm_ops->get_page(ctx->ptep); } WARN_ON_ONCE(!stage2_try_set_pte(ctx, new)); return 0; } if (!stage2_try_break_pte(ctx, data->mmu)) return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), granule); if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); return 0; } static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); int ret; if (!stage2_leaf_mapping_allowed(ctx, data)) return 0; ret = stage2_map_walker_try_leaf(ctx, data); if (ret) return ret; mm_ops->free_unlinked_table(childp, ctx->level); return 0; } static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, struct stage2_map_data *data) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp, new; int ret; ret = stage2_map_walker_try_leaf(ctx, data); if (ret != -E2BIG) return ret; if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) return -EINVAL; if (!data->memcache) return -ENOMEM; childp = mm_ops->zalloc_page(data->memcache); if (!childp) return -ENOMEM; if (!stage2_try_break_pte(ctx, data->mmu)) { mm_ops->put_page(childp); return -EAGAIN; } /* * If we've run into an existing block mapping then replace it with * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); return 0; } /* * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. */ static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct stage2_map_data *data = ctx->arg; switch (visit) { case KVM_PGTABLE_WALK_TABLE_PRE: return stage2_map_walk_table_pre(ctx, data); case KVM_PGTABLE_WALK_LEAF: return stage2_map_walk_leaf(ctx, data); default: return -EINVAL; } } int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, enum kvm_pgtable_prot prot, void *mc, enum kvm_pgtable_walk_flags flags) { int ret; struct stage2_map_data map_data = { .phys = ALIGN_DOWN(phys, PAGE_SIZE), .mmu = pgt->mmu, .memcache = mc, .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = flags | KVM_PGTABLE_WALK_TABLE_PRE | KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) return -EINVAL; ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); return ret; } int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, void *mc, u8 owner_id) { int ret; struct stage2_map_data map_data = { .mmu = pgt->mmu, .memcache = mc, .owner_id = owner_id, .force_pte = true, .annotation = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_TABLE_PRE | KVM_PGTABLE_WALK_LEAF, .arg = &map_data, }; if (owner_id > KVM_MAX_OWNER_ID) return -EINVAL; ret = kvm_pgtable_walk(pgt, addr, size, &walker); return ret; } static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable *pgt = ctx->arg; struct kvm_s2_mmu *mmu = pgt->mmu; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; kvm_pte_t *childp = NULL; bool need_flush = false; if (!kvm_pte_valid(ctx->old)) { if (stage2_pte_is_counted(ctx->old)) { kvm_clear_pte(ctx->ptep); mm_ops->put_page(ctx->ptep); } return 0; } if (kvm_pte_table(ctx->old, ctx->level)) { childp = kvm_pte_follow(ctx->old, mm_ops); if (mm_ops->page_count(childp) != 1) return 0; } else if (stage2_pte_cacheable(pgt, ctx->old)) { need_flush = !stage2_has_fwb(pgt); } /* * This is similar to the map() path in that we unmap the entire * block entry and rely on the remaining portions being faulted * back lazily. */ stage2_unmap_put_pte(ctx, mmu, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); if (childp) mm_ops->put_page(childp); return 0; } int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) { int ret; struct kvm_pgtable_walker walker = { .cb = stage2_unmap_walker, .arg = pgt, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); if (stage2_unmap_defer_tlb_flush(pgt)) /* Perform the deferred TLB invalidations */ kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); return ret; } struct stage2_attr_data { kvm_pte_t attr_set; kvm_pte_t attr_clr; kvm_pte_t pte; s8 level; }; static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t pte = ctx->old; struct stage2_attr_data *data = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!kvm_pte_valid(ctx->old)) return -EAGAIN; data->level = ctx->level; data->pte = pte; pte &= ~data->attr_clr; pte |= data->attr_set; /* * We may race with the CPU trying to set the access flag here, * but worst-case the access flag update gets lost and will be * set on the next access instead. */ if (data->pte != pte) { /* * Invalidate instruction cache before updating the guest * stage-2 PTE if we are going to add executable permission. */ if (mm_ops->icache_inval_pou && stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), kvm_granule_size(ctx->level)); if (!stage2_try_set_pte(ctx, pte)) return -EAGAIN; } return 0; } static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, u64 size, kvm_pte_t attr_set, kvm_pte_t attr_clr, kvm_pte_t *orig_pte, s8 *level, enum kvm_pgtable_walk_flags flags) { int ret; kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; struct stage2_attr_data data = { .attr_set = attr_set & attr_mask, .attr_clr = attr_clr & attr_mask, }; struct kvm_pgtable_walker walker = { .cb = stage2_attr_walker, .arg = &data, .flags = flags | KVM_PGTABLE_WALK_LEAF, }; ret = kvm_pgtable_walk(pgt, addr, size, &walker); if (ret) return ret; if (orig_pte) *orig_pte = data.pte; if (level) *level = data.level; return 0; } int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) { return stage2_update_leaf_attrs(pgt, addr, size, 0, KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, NULL, NULL, 0); } void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags) { int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, NULL, NULL, flags); if (!ret) dsb(ishst); } struct stage2_age_data { bool mkold; bool young; }; static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; struct stage2_age_data *data = ctx->arg; if (!kvm_pte_valid(ctx->old) || new == ctx->old) return 0; data->young = true; /* * stage2_age_walker() is always called while holding the MMU lock for * write, so this will always succeed. Nonetheless, this deliberately * follows the race detection pattern of the other stage-2 walkers in * case the locking mechanics of the MMU notifiers is ever changed. */ if (data->mkold && !stage2_try_set_pte(ctx, new)) return -EAGAIN; /* * "But where's the TLBI?!", you scream. * "Over in the core code", I sigh. * * See the '->clear_flush_young()' callback on the KVM mmu notifier. */ return 0; } bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) { struct stage2_age_data data = { .mkold = mkold, }; struct kvm_pgtable_walker walker = { .cb = stage2_age_walker, .arg = &data, .flags = KVM_PGTABLE_WALK_LEAF, }; WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); return data.young; } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; kvm_pte_t set = 0, clr = 0; if (prot & KVM_PTE_LEAF_ATTR_HI_SW) return -EINVAL; if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; if (prot & KVM_PGTABLE_PROT_W) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable *pgt = ctx->arg; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; if (!stage2_pte_cacheable(pgt, ctx->old)) return 0; if (mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), kvm_granule_size(ctx->level)); return 0; } int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) { struct kvm_pgtable_walker walker = { .cb = stage2_flush_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = pgt, }; if (stage2_has_fwb(pgt)) return 0; return kvm_pgtable_walk(pgt, addr, size, &walker); } kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, enum kvm_pgtable_prot prot, void *mc, bool force_pte) { struct stage2_map_data map_data = { .phys = phys, .mmu = pgt->mmu, .memcache = mc, .force_pte = force_pte, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_SKIP_BBM_TLBI | KVM_PGTABLE_WALK_SKIP_CMO, .arg = &map_data, }; /* * The input address (.addr) is irrelevant for walking an * unlinked table. Construct an ambiguous IA range to map * kvm_granule_size(level) worth of memory. */ struct kvm_pgtable_walk_data data = { .walker = &walker, .addr = 0, .end = kvm_granule_size(level), }; struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; kvm_pte_t *pgtable; int ret; if (!IS_ALIGNED(phys, kvm_granule_size(level))) return ERR_PTR(-EINVAL); ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); if (ret) return ERR_PTR(ret); pgtable = mm_ops->zalloc_page(mc); if (!pgtable) return ERR_PTR(-ENOMEM); ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, level + 1); if (ret) { kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); return ERR_PTR(ret); } return pgtable; } /* * Get the number of page-tables needed to replace a block with a * fully populated tree up to the PTE entries. Note that @level is * interpreted as in "level @level entry". */ static int stage2_block_get_nr_page_tables(s8 level) { switch (level) { case 1: return PTRS_PER_PTE + 1; case 2: return 1; case 3: return 0; default: WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || level > KVM_PGTABLE_LAST_LEVEL); return -EINVAL; }; } static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; struct kvm_mmu_memory_cache *mc = ctx->arg; struct kvm_s2_mmu *mmu; kvm_pte_t pte = ctx->old, new, *childp; enum kvm_pgtable_prot prot; s8 level = ctx->level; bool force_pte; int nr_pages; u64 phys; /* No huge-pages exist at the last level */ if (level == KVM_PGTABLE_LAST_LEVEL) return 0; /* We only split valid block mappings */ if (!kvm_pte_valid(pte)) return 0; nr_pages = stage2_block_get_nr_page_tables(level); if (nr_pages < 0) return nr_pages; if (mc->nobjs >= nr_pages) { /* Build a tree mapped down to the PTE granularity. */ force_pte = true; } else { /* * Don't force PTEs, so create_unlinked() below does * not populate the tree up to the PTE level. The * consequence is that the call will require a single * page of level 2 entries at level 1, or a single * page of PTEs at level 2. If we are at level 1, the * PTEs will be created recursively. */ force_pte = false; nr_pages = 1; } if (mc->nobjs < nr_pages) return -ENOMEM; mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); phys = kvm_pte_to_phys(pte); prot = kvm_pgtable_stage2_pte_prot(pte); childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, level, prot, mc, force_pte); if (IS_ERR(childp)) return PTR_ERR(childp); if (!stage2_try_break_pte(ctx, mmu)) { kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); return -EAGAIN; } /* * Note, the contents of the page table are guaranteed to be made * visible before the new PTE is assigned because stage2_make_pte() * writes the PTE using smp_store_release(). */ new = kvm_init_table_pte(childp, mm_ops); stage2_make_pte(ctx, new); return 0; } int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc) { struct kvm_pgtable_walker walker = { .cb = stage2_split_walker, .flags = KVM_PGTABLE_WALK_LEAF, .arg = mc, }; int ret; ret = kvm_pgtable_walk(pgt, addr, size, &walker); dsb(ishst); return ret; } int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = mmu->vtcr; u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); if (!pgt->pgd) return -ENOMEM; pgt->ia_bits = ia_bits; pgt->start_level = start_level; pgt->mm_ops = mm_ops; pgt->mmu = mmu; pgt->flags = flags; pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); return 0; } size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) { u32 ia_bits = VTCR_EL2_IPA(vtcr); u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; } static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; if (!stage2_pte_is_counted(ctx->old)) return 0; mm_ops->put_page(ctx->ptep); if (kvm_pte_table(ctx->old, ctx->level)) mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); return 0; } void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) { size_t pgd_sz; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); pgt->pgd = NULL; } void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { .cb = stage2_free_walker, .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, }; struct kvm_pgtable_walk_data data = { .walker = &walker, /* * At this point the IPA really doesn't matter, as the page * table being traversed has already been removed from the stage * 2. Set an appropriate range to cover the entire page table. */ .addr = 0, .end = kvm_granule_size(level), }; WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); WARN_ON(mm_ops->page_count(pgtable) != 1); mm_ops->put_page(pgtable); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 /* SPDX-License-Identifier: GPL-2.0 */ /* * This is <linux/capability.h> * * Andrew G. Morgan <morgan@kernel.org> * Alexander Kjeldaas <astor@guardian.no> * with help from Aleph1, Roland Buresund and Andrew Main. * * See here for the libcap library ("POSIX draft" compliance): * * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ */ #ifndef _LINUX_CAPABILITY_H #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> #include <linux/uidgid.h> #include <linux/bits.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 extern int file_caps_enabled; typedef struct { u64 val; } kernel_cap_t; /* same as vfs_ns_cap_data but in cpu endian and always filled completely */ struct cpu_vfs_cap_data { __u32 magic_etc; kuid_t rootid; kernel_cap_t permitted; kernel_cap_t inheritable; }; #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) struct file; struct inode; struct dentry; struct task_struct; struct user_namespace; struct mnt_idmap; /* * CAP_FS_MASK and CAP_NFSD_MASKS: * * The fs mask is all the privileges that fsuid==0 historically meant. * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE. * * It has never meant setting security.* and trusted.* xattrs. * * We could also define fsmask as follows: * 1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions * 2. The security.* and trusted.* xattrs are fs-related MAC permissions */ # define CAP_FS_MASK (BIT_ULL(CAP_CHOWN) \ | BIT_ULL(CAP_MKNOD) \ | BIT_ULL(CAP_DAC_OVERRIDE) \ | BIT_ULL(CAP_DAC_READ_SEARCH) \ | BIT_ULL(CAP_FOWNER) \ | BIT_ULL(CAP_FSETID) \ | BIT_ULL(CAP_MAC_OVERRIDE)) #define CAP_VALID_MASK (BIT_ULL(CAP_LAST_CAP+1)-1) # define CAP_EMPTY_SET ((kernel_cap_t) { 0 }) # define CAP_FULL_SET ((kernel_cap_t) { CAP_VALID_MASK }) # define CAP_FS_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) }) # define CAP_NFSD_SET ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) }) # define cap_clear(c) do { (c).val = 0; } while (0) #define cap_raise(c, flag) ((c).val |= BIT_ULL(flag)) #define cap_lower(c, flag) ((c).val &= ~BIT_ULL(flag)) #define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0) static inline kernel_cap_t cap_combine(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val | b.val }; } static inline kernel_cap_t cap_intersect(const kernel_cap_t a, const kernel_cap_t b) { return (kernel_cap_t) { a.val & b.val }; } static inline kernel_cap_t cap_drop(const kernel_cap_t a, const kernel_cap_t drop) { return (kernel_cap_t) { a.val &~ drop.val }; } static inline bool cap_isclear(const kernel_cap_t a) { return !a.val; } static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b) { return a.val == b.val; } /* * Check if "a" is a subset of "set". * return true if ALL of the capabilities in "a" are also in "set" * cap_issubset(0101, 1111) will return true * return false if ANY of the capabilities in "a" are not in "set" * cap_issubset(1111, 0101) will return false */ static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set) { return !(a.val & ~set.val); } /* Used to decide between falling back on the old suser() or fsuser(). */ static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a) { return cap_drop(a, CAP_FS_SET); } static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_FS_SET)); } static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a) { return cap_drop(a, CAP_NFSD_SET); } static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, const kernel_cap_t permitted) { return cap_combine(a, cap_intersect(permitted, CAP_NFSD_SET)); } #ifdef CONFIG_MULTIUSER extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); extern bool has_capability_noaudit(struct task_struct *t, int cap); extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); extern bool ns_capable_setid(struct user_namespace *ns, int cap); #else static inline bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool has_capability_noaudit(struct task_struct *t, int cap) { return true; } static inline bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { return true; } static inline bool capable(int cap) { return true; } static inline bool ns_capable(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return true; } static inline bool ns_capable_setid(struct user_namespace *ns, int cap) { return true; } #endif /* CONFIG_MULTIUSER */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode); bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); static inline bool perfmon_capable(void) { return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN); } static inline bool bpf_capable(void) { return capable(CAP_BPF) || capable(CAP_SYS_ADMIN); } static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) { return ns_capable(ns, CAP_CHECKPOINT_RESTORE) || ns_capable(ns, CAP_SYS_ADMIN); } /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry, const void **ivalue, size_t size); #endif /* !_LINUX_CAPABILITY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H #include <linux/poison.h> #include <linux/const.h> /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. * (up to 2^31 different values guaranteed on all platforms) * * In the standard hlist, termination of a list is the NULL pointer. * In this special 'nulls' variant, we use the fact that objects stored in * a list are aligned on a word (4 or 8 bytes alignment). * We therefore use the last significant bit of 'ptr' : * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1) * Set to 0 : This is a pointer to some object (ptr) */ struct hlist_nulls_head { struct hlist_nulls_node *first; }; struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; #define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)} #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_nulls_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \ }) /** * ptr_is_a_nulls - Test if a ptr is a nulls * @ptr: ptr to be tested * */ static inline int is_a_nulls(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr & 1); } /** * get_nulls_value - Get the 'nulls' value of the end of chain * @ptr: end of chain * * Should be called only if is_a_nulls(ptr); */ static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr) { return ((unsigned long)ptr) >> 1; } /** * hlist_nulls_unhashed - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. */ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h) { return !h->pprev; } /** * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized? * @h: Node to be checked * * Not that not all removal functions will leave a node in unhashed state. * For example, hlist_del_init_rcu() leaves the node in unhashed state, * but hlist_nulls_del() does not. Unlike hlist_nulls_unhashed(), this * function may be used locklessly. */ static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h) { return !READ_ONCE(h->pprev); } static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) { return is_a_nulls(READ_ONCE(h->first)); } static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, struct hlist_nulls_head *h) { struct hlist_nulls_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); h->first = n; if (!is_a_nulls(first)) WRITE_ONCE(first->pprev, &n->next); } static inline void __hlist_nulls_del(struct hlist_nulls_node *n) { struct hlist_nulls_node *next = n->next; struct hlist_nulls_node **pprev = n->pprev; WRITE_ONCE(*pprev, next); if (!is_a_nulls(next)) WRITE_ONCE(next->pprev, pprev); } static inline void hlist_nulls_del(struct hlist_nulls_node *n) { __hlist_nulls_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_nulls_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry(tpos, pos, head, member) \ for (pos = (head)->first; \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @member: the name of the hlist_node within the struct. * */ #define hlist_nulls_for_each_entry_from(tpos, pos, member) \ for (; (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) #endif
20 20 20 20 20 20 20 23 23 23 20 20 23 23 94 54 52 1 54 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 // SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /** * may_write_xattr - check whether inode allows writing xattr * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never * set or remove an extended attribute on a read-only filesystem or on an * immutable / append-only inode. * * We also need to ensure that the inode has a mapping in the mount to * not risk writing back invalid i_{g,u}id values. * * Return: On success zero is returned. On error a negative errno is returned. */ int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; ret = may_write_xattr(idmap, inode); if (ret) return ret; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(idmap, inode)) return -EPERM; } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); int import_xattr_name(struct xattr_name *kname, const char __user *name) { int error = strncpy_from_user(kname->name, name, sizeof(kname->name)); if (error == 0 || error == sizeof(kname->name)) return -ERANGE; if (error < 0) return error; return 0; } /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = import_xattr_name(ctx->kname, name); if (error) return error; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; struct filename *filename; unsigned int lookup_flags = 0; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; error = setxattr_copy(name, &ctx); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; else error = file_setxattr(fd_file(f), &ctx); } else { error = filename_setxattr(dfd, filename, lookup_flags, &ctx); } kvfree(ctx.kvalue); return error; } SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, const struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; return path_setxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size, args.flags); } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size, flags); } /* * Extended attribute GET operations */ static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } if (is_posix_acl_xattr(kname)) error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(kvalue); return error; } ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { audit_file(f); return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); } /* unconditionally consumes filename */ ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_getxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, void __user *value, size_t size) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .value = value, .size = size, .kname = &kname, .flags = 0, }; struct filename *filename; ssize_t error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_getxattr(fd_file(f), &ctx); } else { int lookup_flags = 0; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; return filename_getxattr(dfd, filename, lookup_flags, &ctx); } } SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; if (args.flags != 0) return -EINVAL; return path_getxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t file_listxattr(struct file *f, char __user *list, size_t size) { audit_file(f); return listxattr(f->f_path.dentry, list, size); } /* unconditionally consumes filename */ static ssize_t filename_listxattr(int dfd, struct filename *filename, unsigned int lookup_flags, char __user *list, size_t size) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_listxattrat(int dfd, const char __user *pathname, unsigned int at_flags, char __user *list, size_t size) { struct filename *filename; int lookup_flags; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_listxattr(fd_file(f), list, size); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_listxattr(dfd, filename, lookup_flags, list, size); } SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, char __user *, list, size_t, size) { return path_listxattrat(dfd, pathname, at_flags, list, size); } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { if (is_posix_acl_xattr(name)) return vfs_remove_acl(idmap, d, name); return vfs_removexattr(idmap, d, name); } static int file_removexattr(struct file *f, struct xattr_name *kname) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = removexattr(file_mnt_idmap(f), f->f_path.dentry, kname->name); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ static int filename_removexattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { struct xattr_name kname; struct filename *filename; unsigned int lookup_flags; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_removexattr(fd_file(f), &kname); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_removexattr(dfd, filename, lookup_flags, &kname); } SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name) { return path_removexattrat(dfd, pathname, at_flags, name); } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; for_each_xattr_handler(handlers, handler) { int err; if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: On success a new xattr object is returned. On failure NULL is * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /** * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry * @key: xattr name * @node: current node * * Compare the xattr name with the xattr name attached to @node in the rbtree. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @node matches @key. */ static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) { const char *xattr_name = key; const struct simple_xattr *xattr; xattr = rb_entry(node, struct simple_xattr, rb_node); return strcmp(xattr->name, xattr_name); } /** * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @new_node matches the xattr attached to @node. */ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, const struct rb_node *node) { struct simple_xattr *xattr; xattr = rb_entry(new_node, struct simple_xattr, rb_node); return rbtree_simple_xattr_cmp(xattr->name, node); } /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; struct rb_node *rbp; int ret = -ENODATA; read_lock(&xattrs->lock); rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); if (rbp) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } read_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. * If it does it will be replaced with the new xattr object. If it doesn't we * fail. If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { simple_xattr_free(new_xattr); return ERR_PTR(-ENOMEM); } } write_lock(&xattrs->lock); rbp = &xattrs->rb_root.rb_node; while (*rbp) { parent = *rbp; ret = rbtree_simple_xattr_cmp(name, *rbp); if (ret < 0) rbp = &(*rbp)->rb_left; else if (ret > 0) rbp = &(*rbp)->rb_right; else old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); if (old_xattr) break; } if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; goto out_unlock; } if (new_xattr) rb_replace_node(&old_xattr->rb_node, &new_xattr->rb_node, &xattrs->rb_root); else rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { err = -ENODATA; goto out_unlock; } /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { rb_link_node(&new_xattr->rb_node, parent, rbp); rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } out_unlock: write_unlock(&xattrs->lock); if (!err) return old_xattr; simple_xattr_free(new_xattr); return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static bool xattr_is_maclabel(const char *name) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && security_ismaclabel(suffix); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; err = security_inode_listsecurity(inode, buffer, remaining_size); if (err < 0) return err; if (buffer) { if (remaining_size < err) return -ERANGE; buffer += err; } remaining_size -= err; err = 0; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; /* skip MAC labels; these are provided by LSM above */ if (xattr_is_maclabel(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /** * rbtree_simple_xattr_less - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * Note that this function technically tolerates duplicate entries. * * Return: True if insertion point in the rbtree is found. */ static bool rbtree_simple_xattr_less(struct rb_node *new_node, const struct rb_node *node) { return rbtree_simple_xattr_node_cmp(new_node, node) < 0; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { write_lock(&xattrs->lock); rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); write_unlock(&xattrs->lock); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize relevant fields of a an xattr header. */ void simple_xattrs_init(struct simple_xattrs *xattrs) { xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; if (freed_space) *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; struct rb_node *rbp_next; rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ /* * Type definitions for the multi-level security (MLS) policy. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * Support for enhanced MLS infrastructure. * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. */ #ifndef _SS_MLS_TYPES_H_ #define _SS_MLS_TYPES_H_ #include "security.h" #include "ebitmap.h" struct mls_level { u32 sens; /* sensitivity */ struct ebitmap cat; /* category set */ }; struct mls_range { struct mls_level level[2]; /* low == level[0], high == level[1] */ }; static inline int mls_level_eq(const struct mls_level *l1, const struct mls_level *l2) { return ((l1->sens == l2->sens) && ebitmap_equal(&l1->cat, &l2->cat)); } static inline int mls_level_dom(const struct mls_level *l1, const struct mls_level *l2) { return ((l1->sens >= l2->sens) && ebitmap_contains(&l1->cat, &l2->cat, 0)); } #define mls_level_incomp(l1, l2) \ (!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1))) #define mls_level_between(l1, l2, l3) \ (mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1))) #define mls_range_contains(r1, r2) \ (mls_level_dom(&(r2).level[0], &(r1).level[0]) && \ mls_level_dom(&(r1).level[1], &(r2).level[1])) #endif /* _SS_MLS_TYPES_H_ */
23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org> * * Fixes from William Schumacher incorporated on 15 March 2001. * (Reported by Charles Bertsch, <CBertsch@microtest.com>). */ /* * This file contains generic functions for manipulating * POSIX 1003.1e draft standard 17 ACLs. */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/atomic.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include <linux/export.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/mnt_idmapping.h> #include <linux/iversion.h> #include <linux/security.h> #include <linux/fsnotify.h> #include <linux/filelock.h> #include "internal.h" static struct posix_acl **acl_by_type(struct inode *inode, int type) { switch (type) { case ACL_TYPE_ACCESS: return &inode->i_acl; case ACL_TYPE_DEFAULT: return &inode->i_default_acl; default: BUG(); } } struct posix_acl *get_cached_acl(struct inode *inode, int type) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *acl; for (;;) { rcu_read_lock(); acl = rcu_dereference(*p); if (!acl || is_uncached_acl(acl) || refcount_inc_not_zero(&acl->a_refcount)) break; rcu_read_unlock(); cpu_relax(); } rcu_read_unlock(); return acl; } EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); if (acl == ACL_DONT_CACHE) { struct posix_acl *ret; ret = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU); if (!IS_ERR(ret)) acl = ret; } return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) { struct posix_acl **p = acl_by_type(inode, type); struct posix_acl *old; old = xchg(p, posix_acl_dup(acl)); if (!is_uncached_acl(old)) posix_acl_release(old); } EXPORT_SYMBOL(set_cached_acl); static void __forget_cached_acl(struct posix_acl **p) { struct posix_acl *old; old = xchg(p, ACL_NOT_CACHED); if (!is_uncached_acl(old)) posix_acl_release(old); } void forget_cached_acl(struct inode *inode, int type) { __forget_cached_acl(acl_by_type(inode, type)); } EXPORT_SYMBOL(forget_cached_acl); void forget_all_cached_acls(struct inode *inode) { __forget_cached_acl(&inode->i_acl); __forget_cached_acl(&inode->i_default_acl); } EXPORT_SYMBOL(forget_all_cached_acls); static struct posix_acl *__get_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, int type) { struct posix_acl *sentinel; struct posix_acl **p; struct posix_acl *acl; /* * The sentinel is used to detect when another operation like * set_cached_acl() or forget_cached_acl() races with get_inode_acl(). * It is guaranteed that is_uncached_acl(sentinel) is true. */ acl = get_cached_acl(inode, type); if (!is_uncached_acl(acl)) return acl; if (!IS_POSIXACL(inode)) return NULL; sentinel = uncached_acl_sentinel(current); p = acl_by_type(inode, type); /* * If the ACL isn't being read yet, set our sentinel. Otherwise, the * current value of the ACL will not be ACL_NOT_CACHED and so our own * sentinel will not be set; another task will update the cache. We * could wait for that other task to complete its job, but it's easier * to just call ->get_inode_acl to fetch the ACL ourself. (This is * going to be an unlikely race.) */ cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get{_inode}_acl will be cached. * A filesystem can prevent that by calling * forget_cached_acl(inode, type) in ->get{_inode}_acl. * * If the filesystem doesn't have a get{_inode}_ acl() function at all, * we'll just create the negative cache entry. */ if (dentry && inode->i_op->get_acl) { acl = inode->i_op->get_acl(idmap, dentry, type); } else if (inode->i_op->get_inode_acl) { acl = inode->i_op->get_inode_acl(inode, type, false); } else { set_cached_acl(inode, type, NULL); return NULL; } if (IS_ERR(acl)) { /* * Remove our sentinel so that we don't block future attempts * to cache the ACL. */ cmpxchg(p, sentinel, ACL_NOT_CACHED); return acl; } /* * Cache the result, but only if our sentinel is still in place. */ posix_acl_dup(acl); if (unlikely(!try_cmpxchg(p, &sentinel, acl))) posix_acl_release(acl); return acl; } struct posix_acl *get_inode_acl(struct inode *inode, int type) { return __get_acl(&nop_mnt_idmap, NULL, inode, type); } EXPORT_SYMBOL(get_inode_acl); /* * Init a fresh posix_acl */ void posix_acl_init(struct posix_acl *acl, int count) { refcount_set(&acl->a_refcount, 1); acl->a_count = count; } EXPORT_SYMBOL(posix_acl_init); /* * Allocate a new ACL with the specified number of entries. */ struct posix_acl * posix_acl_alloc(unsigned int count, gfp_t flags) { struct posix_acl *acl; acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; } EXPORT_SYMBOL(posix_acl_alloc); /* * Clone an ACL. */ struct posix_acl * posix_acl_clone(const struct posix_acl *acl, gfp_t flags) { struct posix_acl *clone = NULL; if (acl) { clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), flags); if (clone) refcount_set(&clone->a_refcount, 1); } return clone; } EXPORT_SYMBOL_GPL(posix_acl_clone); /* * Check if an acl is valid. Returns 0 if it is, or -E... otherwise. */ int posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl) { const struct posix_acl_entry *pa, *pe; int state = ACL_USER_OBJ; int needs_mask = 0; FOREACH_ACL_ENTRY(pa, acl, pe) { if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) return -EINVAL; switch (pa->e_tag) { case ACL_USER_OBJ: if (state == ACL_USER_OBJ) { state = ACL_USER; break; } return -EINVAL; case ACL_USER: if (state != ACL_USER) return -EINVAL; if (!kuid_has_mapping(user_ns, pa->e_uid)) return -EINVAL; needs_mask = 1; break; case ACL_GROUP_OBJ: if (state == ACL_USER) { state = ACL_GROUP; break; } return -EINVAL; case ACL_GROUP: if (state != ACL_GROUP) return -EINVAL; if (!kgid_has_mapping(user_ns, pa->e_gid)) return -EINVAL; needs_mask = 1; break; case ACL_MASK: if (state != ACL_GROUP) return -EINVAL; state = ACL_OTHER; break; case ACL_OTHER: if (state == ACL_OTHER || (state == ACL_GROUP && !needs_mask)) { state = 0; break; } return -EINVAL; default: return -EINVAL; } } if (state == 0) return 0; return -EINVAL; } EXPORT_SYMBOL(posix_acl_valid); /* * Returns 0 if the acl can be exactly represented in the traditional * file mode permission bits, or else 1. Returns -E... on error. */ int posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) { const struct posix_acl_entry *pa, *pe; umode_t mode = 0; int not_equiv = 0; /* * A null ACL can always be presented as mode bits. */ if (!acl) return 0; FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: mode |= (pa->e_perm & S_IRWXO) << 6; break; case ACL_GROUP_OBJ: mode |= (pa->e_perm & S_IRWXO) << 3; break; case ACL_OTHER: mode |= pa->e_perm & S_IRWXO; break; case ACL_MASK: mode = (mode & ~S_IRWXG) | ((pa->e_perm & S_IRWXO) << 3); not_equiv = 1; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; default: return -EINVAL; } } if (mode_p) *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } EXPORT_SYMBOL(posix_acl_equiv_mode); /* * Create an ACL representing the file mode permission bits of an inode. */ struct posix_acl * posix_acl_from_mode(umode_t mode, gfp_t flags) { struct posix_acl *acl = posix_acl_alloc(3, flags); if (!acl) return ERR_PTR(-ENOMEM); acl->a_entries[0].e_tag = ACL_USER_OBJ; acl->a_entries[0].e_perm = (mode & S_IRWXU) >> 6; acl->a_entries[1].e_tag = ACL_GROUP_OBJ; acl->a_entries[1].e_perm = (mode & S_IRWXG) >> 3; acl->a_entries[2].e_tag = ACL_OTHER; acl->a_entries[2].e_perm = (mode & S_IRWXO); return acl; } EXPORT_SYMBOL(posix_acl_from_mode); /* * Return 0 if current is granted want access to the inode * by the acl. Returns -E... otherwise. */ int posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; struct user_namespace *fs_userns = i_user_ns(inode); int found = 0; vfsuid_t vfsuid; vfsgid_t vfsgid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto check_perm; break; case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_MASK: break; case ACL_OTHER: if (found) return -EACCES; else goto check_perm; default: return -EIO; } } return -EIO; mask: for (mask_obj = pa+1; mask_obj != pe; mask_obj++) { if (mask_obj->e_tag == ACL_MASK) { if ((pa->e_perm & mask_obj->e_perm & want) == want) return 0; return -EACCES; } } check_perm: if ((pa->e_perm & want) == want) return 0; return -EACCES; } /* * Modify acl when creating a new inode. The caller must ensure the acl is * only referenced once. * * mode_p initially must contain the mode parameter to the open() / creat() * system calls. All permissions that are not granted by the acl are removed. * The permissions in the acl are changed to reflect the mode_p parameter. */ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) { struct posix_acl_entry *pa, *pe; struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; umode_t mode = *mode_p; int not_equiv = 0; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm &= (mode >> 6) | ~S_IRWXO; mode &= (pa->e_perm << 6) | ~S_IRWXU; break; case ACL_USER: case ACL_GROUP: not_equiv = 1; break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_OTHER: pa->e_perm &= mode | ~S_IRWXO; mode &= pa->e_perm | ~S_IRWXO; break; case ACL_MASK: mask_obj = pa; not_equiv = 1; break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; } else { if (!group_obj) return -EIO; group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; mode &= (group_obj->e_perm << 3) | ~S_IRWXG; } *mode_p = (*mode_p & ~S_IRWXUGO) | mode; return not_equiv; } /* * Modify the ACL for the chmod syscall. */ static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) { struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; struct posix_acl_entry *pa, *pe; /* assert(atomic_read(acl->a_refcount) == 1); */ FOREACH_ACL_ENTRY(pa, acl, pe) { switch(pa->e_tag) { case ACL_USER_OBJ: pa->e_perm = (mode & S_IRWXU) >> 6; break; case ACL_USER: case ACL_GROUP: break; case ACL_GROUP_OBJ: group_obj = pa; break; case ACL_MASK: mask_obj = pa; break; case ACL_OTHER: pa->e_perm = (mode & S_IRWXO); break; default: return -EIO; } } if (mask_obj) { mask_obj->e_perm = (mode & S_IRWXG) >> 3; } else { if (!group_obj) return -EIO; group_obj->e_perm = (mode & S_IRWXG) >> 3; } return 0; } int __posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = posix_acl_create_masq(clone, mode_p); if (err < 0) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_create); int __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) { struct posix_acl *clone = posix_acl_clone(*acl, gfp); int err = -ENOMEM; if (clone) { err = __posix_acl_chmod_masq(clone, mode); if (err) { posix_acl_release(clone); clone = NULL; } } posix_acl_release(*acl); *acl = clone; return err; } EXPORT_SYMBOL(__posix_acl_chmod); /** * posix_acl_chmod - chmod a posix acl * * @idmap: idmap of the mount @inode was found from * @dentry: dentry to check permissions on * @mode: the new mode of @inode * * If the dentry has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. */ int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int ret = 0; if (!IS_POSIXACL(inode)) return 0; if (!inode->i_op->set_acl) return -EOPNOTSUPP; acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR_OR_NULL(acl)) { if (acl == ERR_PTR(-EOPNOTSUPP)) return 0; return PTR_ERR(acl); } ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; ret = inode->i_op->set_acl(idmap, dentry, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } EXPORT_SYMBOL(posix_acl_chmod); int posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; struct posix_acl *clone; int ret; *acl = NULL; *default_acl = NULL; if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; p = get_inode_acl(dir, ACL_TYPE_DEFAULT); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; } if (IS_ERR(p)) return PTR_ERR(p); ret = -ENOMEM; clone = posix_acl_clone(p, GFP_NOFS); if (!clone) goto err_release; ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto err_release_clone; if (ret == 0) posix_acl_release(clone); else *acl = clone; if (!S_ISDIR(*mode)) posix_acl_release(p); else *default_acl = p; return 0; err_release_clone: posix_acl_release(clone); err_release: posix_acl_release(p); return ret; } EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl * @idmap: idmap of the mount @inode was found from * @inode: target inode * @mode_p: mode (pointer) for update * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new * file mode, set *@acl to NULL to indicate that no ACL should be set. * * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then * take care to map the inode according to @idmap before checking * permissions. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply pass @nop_mnt_idmap. * * Called from set_acl inode operations. */ int posix_acl_update_mode(struct mnt_idmap *idmap, struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; int error; error = posix_acl_equiv_mode(*acl, &mode); if (error < 0) return error; if (error == 0) *acl = NULL; if (!in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode))) mode &= ~S_ISGID; *mode_p = mode; return 0; } EXPORT_SYMBOL(posix_acl_update_mode); /* * Fix up the uids and gids in posix acl extended attributes in place. */ static int posix_acl_fix_xattr_common(const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; int count; if (!header) return -EINVAL; if (size < sizeof(struct posix_acl_xattr_header)) return -EINVAL; if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) return -EOPNOTSUPP; count = posix_acl_xattr_count(size); if (count < 0) return -EINVAL; if (count == 0) return 0; return count; } /** * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format * @userns: the filesystem's idmapping * @value: the uapi representation of POSIX ACLs * @size: the size of @void * * Filesystems that store POSIX ACLs in the unaltered uapi format should use * posix_acl_from_xattr() when reading them from the backing store and * converting them into the struct posix_acl VFS format. The helper is * specifically intended to be called from the acl inode operation. * * The posix_acl_from_xattr() function will map the raw {g,u}id values stored * in ACL_{GROUP,USER} entries into idmapping in @userns. * * Note that posix_acl_from_xattr() does not take idmapped mounts into account. * If it did it calling it from the get acl inode operation would return POSIX * ACLs mapped according to an idmapped mount which would mean that the value * couldn't be cached for the filesystem. Idmapped mounts are taken into * account on the fly during permission checking or right at the VFS - * userspace boundary before reporting them to the user. * * Return: Allocated struct posix_acl on success, NULL for a valid header but * without actual POSIX ACL entries, or ERR_PTR() encoded error code. */ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns, const void *value, size_t size) { const struct posix_acl_xattr_header *header = value; const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end; int count; struct posix_acl *acl; struct posix_acl_entry *acl_e; count = posix_acl_fix_xattr_common(value, size); if (count < 0) return ERR_PTR(count); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); acl_e = acl->a_entries; for (end = entry + count; entry != end; acl_e++, entry++) { acl_e->e_tag = le16_to_cpu(entry->e_tag); acl_e->e_perm = le16_to_cpu(entry->e_perm); switch(acl_e->e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: break; case ACL_USER: acl_e->e_uid = make_kuid(userns, le32_to_cpu(entry->e_id)); if (!uid_valid(acl_e->e_uid)) goto fail; break; case ACL_GROUP: acl_e->e_gid = make_kgid(userns, le32_to_cpu(entry->e_id)); if (!gid_valid(acl_e->e_gid)) goto fail; break; default: goto fail; } } return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } EXPORT_SYMBOL (posix_acl_from_xattr); /* * Convert from in-memory to extended attribute representation. */ int posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl, void *buffer, size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; int real_size, n; real_size = posix_acl_xattr_size(acl->a_count); if (!buffer) return real_size; if (real_size > size) return -ERANGE; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: ext_entry->e_id = cpu_to_le32(from_kuid(user_ns, acl_e->e_uid)); break; case ACL_GROUP: ext_entry->e_id = cpu_to_le32(from_kgid(user_ns, acl_e->e_gid)); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return real_size; } EXPORT_SYMBOL (posix_acl_to_xattr); /** * vfs_posix_acl_to_xattr - convert from kernel to userspace representation * @idmap: idmap of the mount * @inode: inode the posix acls are set on * @acl: the posix acls as represented by the vfs * @buffer: the buffer into which to convert @acl * @size: size of @buffer * * This converts @acl from the VFS representation in the filesystem idmapping * to the uapi form reportable to userspace. And mount and caller idmappings * are handled appropriately. * * Return: On success, the size of the stored uapi posix acls, on error a * negative errno. */ static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap, struct inode *inode, const struct posix_acl *acl, void *buffer, size_t size) { struct posix_acl_xattr_header *ext_acl = buffer; struct posix_acl_xattr_entry *ext_entry; struct user_namespace *fs_userns, *caller_userns; ssize_t real_size, n; vfsuid_t vfsuid; vfsgid_t vfsgid; real_size = posix_acl_xattr_size(acl->a_count); if (!buffer) return real_size; if (real_size > size) return -ERANGE; ext_entry = (void *)(ext_acl + 1); ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); fs_userns = i_user_ns(inode); caller_userns = current_user_ns(); for (n=0; n < acl->a_count; n++, ext_entry++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext_entry->e_tag = cpu_to_le16(acl_e->e_tag); ext_entry->e_perm = cpu_to_le16(acl_e->e_perm); switch(acl_e->e_tag) { case ACL_USER: vfsuid = make_vfsuid(idmap, fs_userns, acl_e->e_uid); ext_entry->e_id = cpu_to_le32(from_kuid( caller_userns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: vfsgid = make_vfsgid(idmap, fs_userns, acl_e->e_gid); ext_entry->e_id = cpu_to_le32(from_kgid( caller_userns, vfsgid_into_kgid(vfsgid))); break; default: ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); break; } } return real_size; } int set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type, struct posix_acl *acl) { struct inode *inode = d_inode(dentry); if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) return -EOPNOTSUPP; if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (acl) { int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl); if (ret) return ret; } return inode->i_op->set_acl(idmap, dentry, acl, type); } EXPORT_SYMBOL(set_posix_acl); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { int err; if (!IS_POSIXACL(inode)) return 0; if (inode->i_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_ACCESS); if (err) return err; } if (inode->i_default_acl) { err = xattr_list_one(buffer, remaining_size, XATTR_NAME_POSIX_ACL_DEFAULT); if (err) return err; } return 0; } static bool posix_acl_xattr_list(struct dentry *dentry) { return IS_POSIXACL(d_backing_inode(dentry)); } /* * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs * * This is the legacy POSIX ACL access xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_access); /* * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs * * This is the legacy POSIX ACL default xattr handler. It is used by some * filesystems to implement their ->listxattr() inode operation. New code * should never use them. */ const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .list = posix_acl_xattr_list, }; EXPORT_SYMBOL_GPL(nop_posix_acl_default); int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { int error; struct inode *inode = d_inode(dentry); if (type == ACL_TYPE_ACCESS) { error = posix_acl_update_mode(idmap, inode, &inode->i_mode, &acl); if (error) return error; } inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } int simple_acl_create(struct inode *dir, struct inode *inode) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl); set_cached_acl(inode, ACL_TYPE_ACCESS, acl); if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); return 0; } static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap, struct user_namespace *fs_userns, struct posix_acl *acl) { for (int n = 0; n < acl->a_count; n++) { struct posix_acl_entry *acl_e = &acl->a_entries[n]; switch (acl_e->e_tag) { case ACL_USER: acl_e->e_uid = from_vfsuid(idmap, fs_userns, VFSUIDT_INIT(acl_e->e_uid)); break; case ACL_GROUP: acl_e->e_gid = from_vfsgid(idmap, fs_userns, VFSGIDT_INIT(acl_e->e_gid)); break; } } return 0; } /** * vfs_set_acl - set posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to set the posix acls * @acl_name: the name of the posix acl * @kacl: the posix acls in the appropriate VFS format * * This function sets @kacl. The caller must all posix_acl_release() on @kacl * afterwards. * * Return: On success 0, on error negative errno. */ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct inode *delegated_inode = NULL; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; if (kacl) { /* * If we're on an idmapped mount translate from mount specific * vfs{g,u}id_t into global filesystem k{g,u}id_t. * Afterwards we can cache the POSIX ACLs filesystem wide and - * if this is a filesystem with a backing store - ultimately * translate them to backing store values. */ error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl); if (error) return error; } retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. */ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_set_acl(idmap, dentry, acl_name, kacl); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, kacl); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_set_acl(dentry, acl_name, kacl); } out_inode_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_set_acl); /** * vfs_get_acl - get posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function retrieves @kacl from the filesystem. The caller must all * posix_acl_release() on @kacl. * * Return: On success POSIX ACLs in VFS format, on error negative errno. */ struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { struct inode *inode = d_inode(dentry); struct posix_acl *acl; int acl_type, error; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return ERR_PTR(-EINVAL); /* * The VFS has no restrictions on reading POSIX ACLs so calling * something like xattr_permission() isn't needed. Only LSMs get a say. */ error = security_inode_get_acl(idmap, dentry, acl_name); if (error) return ERR_PTR(error); if (!IS_POSIXACL(inode)) return ERR_PTR(-EOPNOTSUPP); if (S_ISLNK(inode->i_mode)) return ERR_PTR(-EOPNOTSUPP); acl = __get_acl(idmap, dentry, inode, acl_type); if (IS_ERR(acl)) return acl; if (!acl) return ERR_PTR(-ENODATA); return acl; } EXPORT_SYMBOL_GPL(vfs_get_acl); /** * vfs_remove_acl - remove posix acls * @idmap: idmap of the mount * @dentry: the dentry based on which to retrieve the posix acls * @acl_name: the name of the posix acl * * This function removes posix acls. * * Return: On success 0, on error negative errno. */ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int acl_type; int error; struct inode *inode = d_inode(dentry); struct inode *delegated_inode = NULL; acl_type = posix_acl_type(acl_name); if (acl_type < 0) return -EINVAL; retry_deleg: inode_lock(inode); /* * We only care about restrictions the inode struct itself places upon * us otherwise POSIX ACLs aren't subject to any VFS restrictions. */ error = may_write_xattr(idmap, inode); if (error) goto out_inode_unlock; error = security_inode_remove_acl(idmap, dentry, acl_name); if (error) goto out_inode_unlock; error = try_break_deleg(inode, &delegated_inode); if (error) goto out_inode_unlock; if (likely(!is_bad_inode(inode))) error = set_posix_acl(idmap, dentry, acl_type, NULL); else error = -EIO; if (!error) { fsnotify_xattr(dentry); security_inode_post_remove_acl(idmap, dentry, acl_name); } out_inode_unlock: inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_remove_acl); int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { int error; struct posix_acl *acl = NULL; if (size) { /* * Note that posix_acl_from_xattr() uses GFP_NOFS when it * probably doesn't need to here. */ acl = posix_acl_from_xattr(current_user_ns(), kvalue, size); if (IS_ERR(acl)) return PTR_ERR(acl); } error = vfs_set_acl(idmap, dentry, acl_name, acl); posix_acl_release(acl); return error; } ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { ssize_t error; struct posix_acl *acl; acl = vfs_get_acl(idmap, dentry, acl_name); if (IS_ERR(acl)) return PTR_ERR(acl); error = vfs_posix_acl_to_xattr(idmap, d_inode(dentry), acl, kvalue, size); posix_acl_release(acl); return error; }
5 6 6 6 5 5 6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/module.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the * xarray of the same name. Specifically it allows the caller to assert that * the MARK will/will not be changing under the lock, and for devices and * clients, that the value in the xarray is still a valid pointer. Change of * the MARK is linked to the object state, so holding the lock and testing the * MARK also asserts that the contained object is in a certain state. * * This is used to build a two stage register/unregister flow where objects * can continue to be in the xarray even though they are still in progress to * register/unregister. * * The xarray itself provides additional locking, and restartable iteration, * which is also relied on. * * Locks should not be nested, with the exception of client_data, which is * allowed to nest under the read side of the other two locks. * * The devices_rwsem also protects the device name list, any change or * assignment of device name must also hold the write side to guarantee unique * names. */ /* * devices contains devices that have had their names assigned. The * devices may not be registered. Users that care about the registration * status need to call ib_device_try_get() on the device to ensure it is * registered, and keep it registered, for the required duration. * */ static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); static DECLARE_RWSEM(devices_rwsem); #define DEVICE_REGISTERED XA_MARK_1 static u32 highest_client_id; #define CLIENT_REGISTERED XA_MARK_1 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); static DECLARE_RWSEM(clients_rwsem); static void ib_client_put(struct ib_client *client) { if (refcount_dec_and_test(&client->uses)) complete(&client->uses_zero); } /* * If client_data is registered then the corresponding client must also still * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 unsigned int rdma_dev_net_id; /* * A list of net namespaces is maintained in an xarray. This is necessary * because we can't get the locking right using the existing net ns list. We * would require a init_net callback after the list is updated. */ static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); /* * rwsem to protect accessing the rdma_nets xarray entries. */ static DECLARE_RWSEM(rdma_nets_rwsem); bool ib_devices_shared_netns = true; module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); MODULE_PARM_DESC(netns_mode, "Share device among net namespaces; default=1 (shared)"); /** * rdma_dev_access_netns() - Return whether an rdma device can be accessed * from a specified net namespace or not. * @dev: Pointer to rdma device which needs to be checked * @net: Pointer to net namesapce for which access to be checked * * When the rdma device is in shared mode, it ignores the net namespace. * When the rdma device is exclusive to a net namespace, rdma device net * namespace is checked against the specified one. */ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) { return (ib_devices_shared_netns || net_eq(read_pnet(&dev->coredev.rdma_net), net)); } EXPORT_SYMBOL(rdma_dev_access_netns); /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in * the array. This does the same thing as xa_for_each except that it also * returns NULL valued entries if the array is allocating. Simplified to only * work on simple xarrays. */ static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, xa_mark_t filter) { XA_STATE(xas, xa, *indexp); void *entry; rcu_read_lock(); do { entry = xas_find_marked(&xas, ULONG_MAX, filter); if (xa_is_zero(entry)) break; } while (xas_retry(&xas, entry)); rcu_read_unlock(); if (entry) { *indexp = xas.xa_index; if (xa_is_zero(entry)) return NULL; return entry; } return XA_ERROR(-ENOENT); } #define xan_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ !xa_is_err(entry); \ (index)++, entry = xan_find_marked(xa, &(index), filter)) /* RCU hash table mapping netdevice pointers to struct ib_port_data */ static DEFINE_SPINLOCK(ndev_hash_lock); static DECLARE_HASHTABLE(ndev_hash, 5); static void free_netdevs(struct ib_device *ib_dev); static void ib_unregister_work(struct work_struct *work); static void __ib_unregister_device(struct ib_device *device); static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data); static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); static void __ibdev_printk(const char *level, const struct ib_device *ibdev, struct va_format *vaf) { if (ibdev && ibdev->dev.parent) dev_printk_emit(level[1] - '0', ibdev->dev.parent, "%s %s %s: %pV", dev_driver_string(ibdev->dev.parent), dev_name(ibdev->dev.parent), dev_name(&ibdev->dev), vaf); else if (ibdev) printk("%s%s: %pV", level, dev_name(&ibdev->dev), vaf); else printk("%s(NULL ib_device): %pV", level, vaf); } #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ \ va_start(args, fmt); \ \ vaf.fmt = fmt; \ vaf.va = &args; \ \ __ibdev_printk(level, ibdev, &vaf); \ \ va_end(args); \ } \ EXPORT_SYMBOL(func); define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); define_ibdev_printk_level(ibdev_alert, KERN_ALERT); define_ibdev_printk_level(ibdev_crit, KERN_CRIT); define_ibdev_printk_level(ibdev_err, KERN_ERR); define_ibdev_printk_level(ibdev_warn, KERN_WARNING); define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); define_ibdev_printk_level(ibdev_info, KERN_INFO); static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net); /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; struct ib_port_data pdata[]; }; static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(reg_user_mr), IB_MANDATORY_FUNC(dereg_mr), IB_MANDATORY_FUNC(get_port_immutable) }; int i; device->kverbs_provider = true; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) &device->ops + mandatory_table[i].offset)) { device->kverbs_provider = false; break; } } } /* * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. */ struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { if (!rdma_dev_access_netns(device, net)) { device = NULL; goto out; } if (!ib_device_try_get(device)) device = NULL; } out: up_read(&devices_rwsem); return device; } /** * ib_device_put - Release IB device reference * @device: device whose reference to be released * * ib_device_put() releases reference to the IB device to allow it to be * unregistered and eventually free. */ void ib_device_put(struct ib_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->unreg_completion); } EXPORT_SYMBOL(ib_device_put); static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; unsigned long index; xa_for_each (&devices, index, device) if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } /** * ib_device_get_by_name - Find an IB device by name * @name: The name to look for * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device by its name. The caller must call * ib_device_put() on the returned pointer. */ struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id) { struct ib_device *device; down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && device->ops.driver_id != driver_id) device = NULL; if (device) { if (!ib_device_try_get(device)) device = NULL; } up_read(&devices_rwsem); return device; } EXPORT_SYMBOL(ib_device_get_by_name); static int rename_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; int ret = 0; mutex_lock(&device->compat_devs_mutex); xa_for_each (&device->compat_devs, index, cdev) { ret = device_rename(&cdev->dev, dev_name(&device->dev)); if (ret) { dev_warn(&cdev->dev, "Fail to rename compatdev to new name %s\n", dev_name(&device->dev)); break; } } mutex_unlock(&device->compat_devs_mutex); return ret; } int ib_device_rename(struct ib_device *ibdev, const char *name) { unsigned long index; void *client_data; int ret; down_write(&devices_rwsem); if (!strcmp(name, dev_name(&ibdev->dev))) { up_write(&devices_rwsem); return 0; } if (__ib_device_get_by_name(name)) { up_write(&devices_rwsem); return -EEXIST; } ret = device_rename(&ibdev->dev, name); if (ret) { up_write(&devices_rwsem); return ret; } strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); ret = rename_compat_devs(ibdev); downgrade_write(&devices_rwsem); down_read(&ibdev->client_data_rwsem); xan_for_each_marked(&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->rename) continue; client->rename(ibdev, client_data); } up_read(&ibdev->client_data_rwsem); rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); up_read(&devices_rwsem); return 0; } int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) { if (use_dim > 1) return -EINVAL; ibdev->use_cq_dim = use_dim; return 0; } static int alloc_name(struct ib_device *ibdev, const char *name) { struct ib_device *device; unsigned long index; struct ida inuse; int rc; int i; lockdep_assert_held_write(&devices_rwsem); ida_init(&inuse); xa_for_each (&devices, index, device) { char buf[IB_DEVICE_NAME_MAX]; if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= INT_MAX) continue; snprintf(buf, sizeof buf, name, i); if (strcmp(buf, dev_name(&device->dev)) != 0) continue; rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); if (rc < 0) goto out; } rc = ida_alloc(&inuse, GFP_KERNEL); if (rc < 0) goto out; rc = dev_set_name(&ibdev->dev, name, rc); out: ida_destroy(&inuse); return rc; } static void ib_device_release(struct device *device) { struct ib_device *dev = container_of(device, struct ib_device, dev); free_netdevs(dev); WARN_ON(refcount_read(&dev->refcount)); if (dev->hw_stats_data) ib_device_release_hw_stats(dev->hw_stats_data); if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } mutex_destroy(&dev->subdev_lock); mutex_destroy(&dev->unregistration_lock); mutex_destroy(&dev->compat_devs_mutex); xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); } static int ib_device_uevent(const struct device *device, struct kobj_uevent_env *env) { if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* * It would be nice to pass the node GUID with the event... */ return 0; } static const void *net_namespace(const struct device *d) { const struct ib_core_device *coredev = container_of(d, struct ib_core_device, dev); return read_pnet(&coredev->rdma_net); } static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, }; static void rdma_init_coredev(struct ib_core_device *coredev, struct ib_device *dev, struct net *net) { bool is_full_dev = &dev->coredev == coredev; /* This BUILD_BUG_ON is intended to catch layout change * of union of ib_core_device and device. * dev must be the first element as ib_core and providers * driver uses it. Adding anything in ib_core_device before * device will break this assumption. */ BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != offsetof(struct ib_device, dev)); coredev->dev.class = &ib_class; coredev->dev.groups = dev->groups; /* * Don't expose hw counters outside of the init namespace. */ if (!is_full_dev && dev->hw_stats_attr_index) coredev->dev.groups[dev->hw_stats_attr_index] = NULL; device_initialize(&coredev->dev); coredev->owner = dev; INIT_LIST_HEAD(&coredev->port_list); write_pnet(&coredev->rdma_net, net); } /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ struct ib_device *_ib_alloc_device(size_t size) { struct ib_device *device; unsigned int i; if (WARN_ON(size < sizeof(struct ib_device))) return NULL; device = kzalloc(size, GFP_KERNEL); if (!device) return NULL; if (rdma_restrack_init(device)) { kfree(device); return NULL; } rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); init_rwsem(&device->event_handler_rwsem); mutex_init(&device->unregistration_lock); /* * client_data needs to be alloc because we don't want our mark to be * destroyed if the user stores NULL in the client data. */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); spin_lock_init(&device->cq_pools_lock); for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) INIT_LIST_HEAD(&device->cq_pools[i]); rwlock_init(&device->cache_lock); device->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); mutex_init(&device->subdev_lock); INIT_LIST_HEAD(&device->subdev_list_head); INIT_LIST_HEAD(&device->subdev_list); return device; } EXPORT_SYMBOL(_ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->ops.dealloc_driver) device->ops.dealloc_driver(device); /* * ib_unregister_driver() requires all devices to remain in the xarray * while their ops are callable. The last op we call is dealloc_driver * above. This is needed to create a fence on op callbacks prior to * allowing the driver module to unload. */ down_write(&devices_rwsem); if (xa_load(&devices, device->index) == device) xa_erase(&devices, device->index); up_write(&devices_rwsem); /* Expedite releasing netdev references */ free_netdevs(device); WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); /* Balances with device_initialize */ put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); /* * add_client_context() and remove_client_context() must be safe against * parallel calls on the same device - registration/unregistration of both the * device and client can be occurring in parallel. * * The routines need to be a fence, any caller must not return until the add * or remove is fully completed. */ static int add_client_context(struct ib_device *device, struct ib_client *client) { int ret = 0; if (!device->kverbs_provider && !client->no_kverbs_req) return 0; down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device * unregistration locks. */ if (!refcount_inc_not_zero(&client->uses)) goto out_unlock; refcount_inc(&device->refcount); /* * Another caller to add_client_context got here first and has already * completely initialized context. */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { if (client->add(device)) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED); up_read(&device->client_data_rwsem); return 0; out: ib_device_put(device); ib_client_put(client); out_unlock: up_write(&device->client_data_rwsem); return ret; } static void remove_client_context(struct ib_device *device, unsigned int client_id) { struct ib_client *client; void *client_data; down_write(&device->client_data_rwsem); if (!xa_get_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED)) { up_write(&device->client_data_rwsem); return; } client_data = xa_load(&device->client_data, client_id); xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); client = xa_load(&clients, client_id); up_write(&device->client_data_rwsem); /* * Notice we cannot be holding any exclusive locks when calling the * remove callback as the remove callback can recurse back into any * public functions in this module and thus try for any locks those * functions take. * * For this reason clients and drivers should not call the * unregistration functions will holdling any locks. */ if (client->remove) client->remove(device, client_data); xa_erase(&device->client_data, client_id); ib_device_put(device); ib_client_put(client); } static int alloc_port_data(struct ib_device *device) { struct ib_port_data_rcu *pdata_rcu; u32 port; if (device->port_data) return 0; /* This can only be called once the physical port range is defined */ if (WARN_ON(!device->phys_port_cnt)) return -EINVAL; /* Reserve U32_MAX so the logic to go over all the ports is sane */ if (WARN_ON(device->phys_port_cnt == U32_MAX)) return -EINVAL; /* * device->port_data is indexed directly by the port number to make * access to this data as efficient as possible. * * Therefore port_data is declared as a 1 based array with potential * empty slots at the beginning. */ pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, size_add(rdma_end_port(device), 1)), GFP_KERNEL); if (!pdata_rcu) return -ENOMEM; /* * The rcu_head is put in front of the port data array and the stored * pointer is adjusted since we never need to see that member until * kfree_rcu. */ device->port_data = pdata_rcu->pdata; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; pdata->ib_dev = device; spin_lock_init(&pdata->pkey_list_lock); INIT_LIST_HEAD(&pdata->pkey_list); spin_lock_init(&pdata->netdev_lock); INIT_HLIST_NODE(&pdata->ndev_hash_link); } return 0; } static int verify_immutable(const struct ib_device *dev, u32 port) { return WARN_ON(!rdma_cap_ib_mad(dev, port) && rdma_max_mad_size(dev, port) != 0); } static int setup_port_data(struct ib_device *device) { u32 port; int ret; ret = alloc_port_data(device); if (ret) return ret; rdma_for_each_port (device, port) { struct ib_port_data *pdata = &device->port_data[port]; ret = device->ops.get_port_immutable(device, port, &pdata->immutable); if (ret) return ret; if (verify_immutable(device, port)) return -EINVAL; } return 0; } /** * ib_port_immutable_read() - Read rdma port's immutable data * @dev: IB device * @port: port number whose immutable data to read. It starts with index 1 and * valid upto including rdma_end_port(). */ const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port) { WARN_ON(!rdma_is_port_valid(dev, port)); return &dev->port_data[port].immutable; } EXPORT_SYMBOL(ib_port_immutable_read); void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->ops.get_dev_fw_str) dev->ops.get_dev_fw_str(dev, str); else str[0] = '\0'; } EXPORT_SYMBOL(ib_get_device_fw_str); static void ib_policy_change_task(struct work_struct *work) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned int i; rdma_for_each_port (dev, i) { u64 sp; ib_get_cached_subnet_prefix(dev, i, &sp); ib_security_cache_change(dev, i, sp); } } up_read(&devices_rwsem); } static int ib_security_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; schedule_work(&ib_policy_change_work); ib_mad_agent_security_change(); return NOTIFY_OK; } static void compatdev_release(struct device *dev) { struct ib_core_device *cdev = container_of(dev, struct ib_core_device, dev); kfree(cdev); } static int add_one_compat_dev(struct ib_device *device, struct rdma_dev_net *rnet) { struct ib_core_device *cdev; int ret; lockdep_assert_held(&rdma_nets_rwsem); if (!ib_devices_shared_netns) return 0; /* * Create and add compat device in all namespaces other than where it * is currently bound to. */ if (net_eq(read_pnet(&rnet->net), read_pnet(&device->coredev.rdma_net))) return 0; /* * The first of init_net() or ib_register_device() to take the * compat_devs_mutex wins and gets to add the device. Others will wait * for completion here. */ mutex_lock(&device->compat_devs_mutex); cdev = xa_load(&device->compat_devs, rnet->id); if (cdev) { ret = 0; goto done; } ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); if (ret) goto done; cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) { ret = -ENOMEM; goto cdev_err; } cdev->dev.parent = device->dev.parent; rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); cdev->dev.release = compatdev_release; ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); if (ret) goto add_err; ret = device_add(&cdev->dev); if (ret) goto add_err; ret = ib_setup_port_attrs(cdev); if (ret) goto port_err; ret = xa_err(xa_store(&device->compat_devs, rnet->id, cdev, GFP_KERNEL)); if (ret) goto insert_err; mutex_unlock(&device->compat_devs_mutex); return 0; insert_err: ib_free_port_attrs(cdev); port_err: device_del(&cdev->dev); add_err: put_device(&cdev->dev); cdev_err: xa_release(&device->compat_devs, rnet->id); done: mutex_unlock(&device->compat_devs_mutex); return ret; } static void remove_one_compat_dev(struct ib_device *device, u32 id) { struct ib_core_device *cdev; mutex_lock(&device->compat_devs_mutex); cdev = xa_erase(&device->compat_devs, id); mutex_unlock(&device->compat_devs_mutex); if (cdev) { ib_free_port_attrs(cdev); device_del(&cdev->dev); put_device(&cdev->dev); } } static void remove_compat_devs(struct ib_device *device) { struct ib_core_device *cdev; unsigned long index; xa_for_each (&device->compat_devs, index, cdev) remove_one_compat_dev(device, index); } static int add_compat_devs(struct ib_device *device) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; lockdep_assert_held(&devices_rwsem); down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, index, rnet) { ret = add_one_compat_dev(device, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); return ret; } static void remove_all_compat_devs(void) { struct ib_compat_device *cdev; struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { unsigned long c_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&dev->compat_devs, c_index, cdev) remove_one_compat_dev(dev, c_index); up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); } static int add_all_compat_devs(void) { struct rdma_dev_net *rnet; struct ib_device *dev; unsigned long index; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { unsigned long net_index = 0; /* Hold nets_rwsem so that any other thread modifying this * system param can sync with this thread. */ down_read(&rdma_nets_rwsem); xa_for_each (&rdma_nets, net_index, rnet) { ret = add_one_compat_dev(dev, rnet); if (ret) break; } up_read(&rdma_nets_rwsem); } up_read(&devices_rwsem); if (ret) remove_all_compat_devs(); return ret; } int rdma_compatdev_set(u8 enable) { struct rdma_dev_net *rnet; unsigned long index; int ret = 0; down_write(&rdma_nets_rwsem); if (ib_devices_shared_netns == enable) { up_write(&rdma_nets_rwsem); return 0; } /* enable/disable of compat devices is not supported * when more than default init_net exists. */ xa_for_each (&rdma_nets, index, rnet) { ret++; break; } if (!ret) ib_devices_shared_netns = enable; up_write(&rdma_nets_rwsem); if (ret) return -EBUSY; if (enable) ret = add_all_compat_devs(); else remove_all_compat_devs(); return ret; } static void rdma_dev_exit_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); struct ib_device *dev; unsigned long index; int ret; down_write(&rdma_nets_rwsem); /* * Prevent the ID from being re-used and hide the id from xa_for_each. */ ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); WARN_ON(ret); up_write(&rdma_nets_rwsem); down_read(&devices_rwsem); xa_for_each (&devices, index, dev) { get_device(&dev->dev); /* * Release the devices_rwsem so that pontentially blocking * device_del, doesn't hold the devices_rwsem for too long. */ up_read(&devices_rwsem); remove_one_compat_dev(dev, rnet->id); /* * If the real device is in the NS then move it back to init. */ rdma_dev_change_netns(dev, net, &init_net); put_device(&dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); rdma_nl_net_exit(rnet); xa_erase(&rdma_nets, rnet->id); } static __net_init int rdma_dev_init_net(struct net *net) { struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); unsigned long index; struct ib_device *dev; int ret; write_pnet(&rnet->net, net); ret = rdma_nl_net_init(rnet); if (ret) return ret; /* No need to create any compat devices in default init_net. */ if (net_eq(net, &init_net)) return 0; ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); if (ret) { rdma_nl_net_exit(rnet); return ret; } down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { /* Hold nets_rwsem so that netlink command cannot change * system configuration for device sharing mode. */ down_read(&rdma_nets_rwsem); ret = add_one_compat_dev(dev, rnet); up_read(&rdma_nets_rwsem); if (ret) break; } up_read(&devices_rwsem); if (ret) rdma_dev_exit_net(net); return ret; } /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. */ static int assign_name(struct ib_device *device, const char *name) { static u32 last_id; int ret; down_write(&devices_rwsem); /* Assign a unique name to the device */ if (strchr(name, '%')) ret = alloc_name(device, name); else ret = dev_set_name(&device->dev, name); if (ret) goto out; if (__ib_device_get_by_name(dev_name(&device->dev))) { ret = -ENFILE; goto out; } strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, &last_id, GFP_KERNEL); if (ret > 0) ret = 0; out: up_write(&devices_rwsem); return ret; } /* * setup_device() allocates memory and sets up data that requires calling the * device ops, this is the only reason these actions are not done during * ib_alloc_device. It is undone by ib_dealloc_device(). */ static int setup_device(struct ib_device *device) { struct ib_udata uhw = {.outlen = 0, .inlen = 0}; int ret; ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { dev_warn(&device->dev, "Couldn't create per-port data\n"); return ret; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->ops.query_device(device, &device->attrs, &uhw); if (ret) { dev_warn(&device->dev, "Couldn't query the device attributes\n"); return ret; } return 0; } static void disable_device(struct ib_device *device) { u32 cid; WARN_ON(!refcount_read(&device->refcount)); down_write(&devices_rwsem); xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); up_write(&devices_rwsem); /* * Remove clients in LIFO order, see assign_client_id. This could be * more efficient if xarray learns to reverse iterate. Since no new * clients can be added to this ib_device past this point we only need * the maximum possible client_id value here. */ down_read(&clients_rwsem); cid = highest_client_id; up_read(&clients_rwsem); while (cid) { cid--; remove_client_context(device, cid); } ib_cq_pool_cleanup(device); /* Pairs with refcount_set in enable_device */ ib_device_put(device); wait_for_completion(&device->unreg_completion); /* * compat devices must be removed after device refcount drops to zero. * Otherwise init_net() may add more compatdevs after removing compat * devices and before device is disabled. */ remove_compat_devs(device); } /* * An enabled device is visible to all clients and to all the public facing * APIs that return a device pointer. This always returns with a new get, even * if it fails. */ static int enable_device_and_get(struct ib_device *device) { struct ib_client *client; unsigned long index; int ret = 0; /* * One ref belongs to the xa and the other belongs to this * thread. This is needed to guard against parallel unregistration. */ refcount_set(&device->refcount, 2); down_write(&devices_rwsem); xa_set_mark(&devices, device->index, DEVICE_REGISTERED); /* * By using downgrade_write() we ensure that no other thread can clear * DEVICE_REGISTERED while we are completing the client setup. */ downgrade_write(&devices_rwsem); if (device->ops.enable_driver) { ret = device->ops.enable_driver(device); if (ret) goto out; } down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { ret = add_client_context(device, client); if (ret) break; } up_read(&clients_rwsem); if (!ret) ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; } static void prevent_dealloc_device(struct ib_device *ib_dev) { } static void ib_device_notify_register(struct ib_device *device) { struct net_device *netdev; u32 port; int ret; down_read(&devices_rwsem); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); if (ret) goto out; rdma_for_each_port(device, port) { netdev = ib_device_get_netdev(device, port); if (!netdev) continue; ret = rdma_nl_notify_event(device, port, RDMA_NETDEV_ATTACH_EVENT); dev_put(netdev); if (ret) goto out; } out: up_read(&devices_rwsem); } /** * ib_register_device - Register an IB device with IB core * @device: Device to register * @name: unique string device name. This may include a '%' which will * cause a unique index to be added to the passed device name. * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB * device will be used. In this case the caller should fully * setup the ibdev for DMA. This usually means using dma_virt_ops. * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). * * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() * asynchronously then the device pointer may become freed as soon as this * function returns. */ int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device) { int ret; ret = assign_name(device, name); if (ret) return ret; /* * If the caller does not provide a DMA capable device then the IB core * will set up ib_sge and scatterlist structures that stash the kernel * virtual address into the address field. */ WARN_ON(dma_device && !dma_device->dma_parms); device->dma_device = dma_device; ret = setup_device(device); if (ret) return ret; ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); return ret; } device->groups[0] = &ib_dev_attr_group; device->groups[1] = device->ops.device_group; ret = ib_setup_device_attrs(device); if (ret) goto cache_cleanup; ib_device_register_rdmacg(device); rdma_counter_init(device); /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. */ dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; ret = ib_setup_port_attrs(&device->coredev); if (ret) { dev_warn(&device->dev, "Couldn't register device with driver model\n"); goto dev_cleanup; } ret = enable_device_and_get(device); if (ret) { void (*dealloc_fn)(struct ib_device *); /* * If we hit this error flow then we don't want to * automatically dealloc the device since the caller is * expected to call ib_dealloc_device() after * ib_register_device() fails. This is tricky due to the * possibility for a parallel unregistration along with this * error flow. Since we have a refcount here we know any * parallel flow is stopped in disable_device and will see the * special dealloc_driver pointer, causing the responsibility to * ib_dealloc_device() to revert back to this thread. */ dealloc_fn = device->ops.dealloc_driver; device->ops.dealloc_driver = prevent_dealloc_device; ib_device_put(device); __ib_unregister_device(device); device->ops.dealloc_driver = dealloc_fn; dev_set_uevent_suppress(&device->dev, false); return ret; } dev_set_uevent_suppress(&device->dev, false); ib_device_notify_register(device); ib_device_put(device); return 0; dev_cleanup: device_del(&device->dev); cg_cleanup: dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); cache_cleanup: ib_cache_cleanup_one(device); return ret; } EXPORT_SYMBOL(ib_register_device); /* Callers must hold a get on the device. */ static void __ib_unregister_device(struct ib_device *ib_dev) { struct ib_device *sub, *tmp; mutex_lock(&ib_dev->subdev_lock); list_for_each_entry_safe_reverse(sub, tmp, &ib_dev->subdev_list_head, subdev_list) { list_del(&sub->subdev_list); ib_dev->ops.del_sub_dev(sub); ib_device_put(ib_dev); } mutex_unlock(&ib_dev->subdev_lock); /* * We have a registration lock so that all the calls to unregister are * fully fenced, once any unregister returns the device is truely * unregistered even if multiple callers are unregistering it at the * same time. This also interacts with the registration flow and * provides sane semantics if register and unregister are racing. */ mutex_lock(&ib_dev->unregistration_lock); if (!refcount_read(&ib_dev->refcount)) goto out; disable_device(ib_dev); rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); ib_free_port_attrs(&ib_dev->coredev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); ib_cache_cleanup_one(ib_dev); /* * Drivers using the new flow may not call ib_dealloc_device except * in error unwind prior to registration success. */ if (ib_dev->ops.dealloc_driver && ib_dev->ops.dealloc_driver != prevent_dealloc_device) { WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); ib_dealloc_device(ib_dev); } out: mutex_unlock(&ib_dev->unregistration_lock); } /** * ib_unregister_device - Unregister an IB device * @ib_dev: The device to unregister * * Unregister an IB device. All clients will receive a remove callback. * * Callers should call this routine only once, and protect against races with * registration. Typically it should only be called as part of a remove * callback in an implementation of driver core's struct device_driver and * related. * * If ops.dealloc_driver is used then ib_dev will be freed upon return from * this function. */ void ib_unregister_device(struct ib_device *ib_dev) { get_device(&ib_dev->dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device); /** * ib_unregister_device_and_put - Unregister a device while holding a 'get' * @ib_dev: The device to unregister * * This is the same as ib_unregister_device(), except it includes an internal * ib_device_put() that should match a 'get' obtained by the caller. * * It is safe to call this routine concurrently from multiple threads while * holding the 'get'. When the function returns the device is fully * unregistered. * * Drivers using this flow MUST use the driver_unregister callback to clean up * their resources associated with the device and dealloc it. */ void ib_unregister_device_and_put(struct ib_device *ib_dev) { WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); ib_device_put(ib_dev); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_and_put); /** * ib_unregister_driver - Unregister all IB devices for a driver * @driver_id: The driver to unregister * * This implements a fence for device unregistration. It only returns once all * devices associated with the driver_id have fully completed their * unregistration and returned from ib_unregister_device*(). * * If device's are not yet unregistered it goes ahead and starts unregistering * them. * * This does not block creation of new devices with the given driver_id, that * is the responsibility of the caller. */ void ib_unregister_driver(enum rdma_driver_id driver_id) { struct ib_device *ib_dev; unsigned long index; down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); up_read(&devices_rwsem); WARN_ON(!ib_dev->ops.dealloc_driver); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); down_read(&devices_rwsem); } up_read(&devices_rwsem); } EXPORT_SYMBOL(ib_unregister_driver); static void ib_unregister_work(struct work_struct *work) { struct ib_device *ib_dev = container_of(work, struct ib_device, unregistration_work); __ib_unregister_device(ib_dev); put_device(&ib_dev->dev); } /** * ib_unregister_device_queued - Unregister a device using a work queue * @ib_dev: The device to unregister * * This schedules an asynchronous unregistration using a WQ for the device. A * driver should use this to avoid holding locks while doing unregistration, * such as holding the RTNL lock. * * Drivers using this API must use ib_unregister_driver before module unload * to ensure that all scheduled unregistrations have completed. */ void ib_unregister_device_queued(struct ib_device *ib_dev) { WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); /* * The caller must pass in a device that has the kref held and the refcount * released. If the device is in cur_net and still registered then it is moved * into net. */ static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, struct net *net) { int ret2 = -EINVAL; int ret; mutex_lock(&device->unregistration_lock); /* * If a device not under ib_device_get() or if the unregistration_lock * is not held, the namespace can be changed, or it can be unregistered. * Check again under the lock. */ if (refcount_read(&device->refcount) == 0 || !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { ret = -ENODEV; goto out; } kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); disable_device(device); /* * At this point no one can be using the device, so it is safe to * change the namespace. */ write_pnet(&device->coredev.rdma_net, net); down_read(&devices_rwsem); /* * Currently rdma devices are system wide unique. So the device name * is guaranteed free in the new namespace. Publish the new namespace * at the sysfs level. */ ret = device_rename(&device->dev, dev_name(&device->dev)); up_read(&devices_rwsem); if (ret) { dev_warn(&device->dev, "%s: Couldn't rename device after namespace change\n", __func__); /* Try and put things back and re-enable the device */ write_pnet(&device->coredev.rdma_net, cur_net); } ret2 = enable_device_and_get(device); if (ret2) { /* * This shouldn't really happen, but if it does, let the user * retry at later point. So don't disable the device. */ dev_warn(&device->dev, "%s: Couldn't re-enable device after namespace change\n", __func__); } kobject_uevent(&device->dev.kobj, KOBJ_ADD); ib_device_put(device); out: mutex_unlock(&device->unregistration_lock); if (ret) return ret; return ret2; } int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd) { struct net *net; int ret; net = get_net_ns_by_fd(ns_fd); if (IS_ERR(net)) { ret = PTR_ERR(net); goto net_err; } if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; goto ns_err; } /* * All the ib_clients, including uverbs, are reset when the namespace is * changed and this cannot be blocked waiting for userspace to do * something, so disassociation is mandatory. */ if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { ret = -EOPNOTSUPP; goto ns_err; } get_device(&dev->dev); ib_device_put(dev); ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); put_device(&dev->dev); put_net(net); return ret; ns_err: put_net(net); net_err: ib_device_put(dev); return ret; } static struct pernet_operations rdma_dev_net_ops = { .init = rdma_dev_init_net, .exit = rdma_dev_exit_net, .id = &rdma_dev_net_id, .size = sizeof(struct rdma_dev_net), }; static int assign_client_id(struct ib_client *client) { int ret; lockdep_assert_held(&clients_rwsem); /* * The add/remove callbacks must be called in FIFO/LIFO order. To * achieve this we assign client_ids so they are sorted in * registration order. */ client->client_id = highest_client_id; ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); if (ret) return ret; highest_client_id++; xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); return 0; } static void remove_client_id(struct ib_client *client) { down_write(&clients_rwsem); xa_erase(&clients, client->client_id); for (; highest_client_id; highest_client_id--) if (xa_load(&clients, highest_client_id - 1)) break; up_write(&clients_rwsem); } /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; unsigned long index; bool need_unreg = false; int ret; refcount_set(&client->uses, 1); init_completion(&client->uses_zero); /* * The devices_rwsem is held in write mode to ensure that a racing * ib_register_device() sees a consisent view of clients and devices. */ down_write(&devices_rwsem); down_write(&clients_rwsem); ret = assign_client_id(client); if (ret) goto out; need_unreg = true; xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { ret = add_client_context(device, client); if (ret) goto out; } ret = 0; out: up_write(&clients_rwsem); up_write(&devices_rwsem); if (need_unreg && ret) ib_unregister_client(client); return ret; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. * * This is a full fence, once it returns no client callbacks will be called, * or are running in another thread. */ void ib_unregister_client(struct ib_client *client) { struct ib_device *device; unsigned long index; down_write(&clients_rwsem); ib_client_put(client); xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); up_write(&clients_rwsem); /* We do not want to have locks while calling client->remove() */ rcu_read_lock(); xa_for_each (&devices, index, device) { if (!ib_device_try_get(device)) continue; rcu_read_unlock(); remove_client_context(device, client->client_id); ib_device_put(device); rcu_read_lock(); } rcu_read_unlock(); /* * remove_client_context() is not a fence, it can return even though a * removal is ongoing. Wait until all removals are completed. */ wait_for_completion(&client->uses_zero); remove_client_id(client); } EXPORT_SYMBOL(ib_unregister_client); static int __ib_get_global_client_nl_info(const char *client_name, struct ib_client_nl_info *res) { struct ib_client *client; unsigned long index; int ret = -ENOENT; down_read(&clients_rwsem); xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { if (strcmp(client->name, client_name) != 0) continue; if (!client->get_global_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_global_nl_info(res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&clients_rwsem); return ret; } static int __ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { unsigned long index; void *client_data; int ret = -ENOENT; down_read(&ibdev->client_data_rwsem); xan_for_each_marked (&ibdev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || strcmp(client->name, client_name) != 0) continue; if (!client->get_nl_info) { ret = -EOPNOTSUPP; break; } ret = client->get_nl_info(ibdev, client_data, res); if (WARN_ON(ret == -ENOENT)) ret = -EINVAL; /* * The cdev is guaranteed valid as long as we are inside the * client_data_rwsem as remove_one can't be called. Keep it * valid for the caller. */ if (!ret && res->cdev) get_device(res->cdev); break; } up_read(&ibdev->client_data_rwsem); return ret; } /** * ib_get_client_nl_info - Fetch the nl_info from a client * @ibdev: IB device * @client_name: Name of the client * @res: Result of the query */ int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res) { int ret; if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); #ifdef CONFIG_MODULES if (ret == -ENOENT) { request_module("rdma-client-%s", client_name); if (ibdev) ret = __ib_get_client_nl_info(ibdev, client_name, res); else ret = __ib_get_global_client_nl_info(client_name, res); } #endif if (ret) { if (ret == -ENOENT) return -EOPNOTSUPP; return ret; } if (WARN_ON(!res->cdev)) return -EINVAL; return 0; } /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context data that can be retrieved with * ib_get_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { void *rc; if (WARN_ON(IS_ERR(data))) data = NULL; rc = xa_store(&device->client_data, client->client_id, data, GFP_KERNEL); WARN_ON(xa_is_err(rc)); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback occurs in workqueue context. */ void ib_register_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ void ib_unregister_event_handler(struct ib_event_handler *event_handler) { down_write(&event_handler->device->event_handler_rwsem); list_del(&event_handler->list); up_write(&event_handler->device->event_handler_rwsem); } EXPORT_SYMBOL(ib_unregister_event_handler); void ib_dispatch_event_clients(struct ib_event *event) { struct ib_event_handler *handler; down_read(&event->device->event_handler_rwsem); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); up_read(&event->device->event_handler_rwsem); } static int iw_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { struct in_device *inetdev; struct net_device *netdev; memset(port_attr, 0, sizeof(*port_attr)); netdev = ib_device_get_netdev(device, port_num); if (!netdev) return -ENODEV; port_attr->max_mtu = IB_MTU_4096; port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); if (!netif_carrier_ok(netdev)) { port_attr->state = IB_PORT_DOWN; port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; } else { rcu_read_lock(); inetdev = __in_dev_get_rcu(netdev); if (inetdev && inetdev->ifa_list) { port_attr->state = IB_PORT_ACTIVE; port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; } else { port_attr->state = IB_PORT_INIT; port_attr->phys_state = IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; } rcu_read_unlock(); } dev_put(netdev); return device->ops.query_port(device, port_num, port_attr); } static int __ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { int err; memset(port_attr, 0, sizeof(*port_attr)); err = device->ops.query_port(device, port_num, port_attr); if (err || port_attr->subnet_prefix) return err; if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) return 0; ib_get_cached_subnet_prefix(device, port_num, &port_attr->subnet_prefix); return 0; } /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (rdma_protocol_iwarp(device, port_num)) return iw_query_port(device, port_num, port_attr); else return __ib_query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); static void add_ndev_hash(struct ib_port_data *pdata) { unsigned long flags; might_sleep(); spin_lock_irqsave(&ndev_hash_lock, flags); if (hash_hashed(&pdata->ndev_hash_link)) { hash_del_rcu(&pdata->ndev_hash_link); spin_unlock_irqrestore(&ndev_hash_lock, flags); /* * We cannot do hash_add_rcu after a hash_del_rcu until the * grace period */ synchronize_rcu(); spin_lock_irqsave(&ndev_hash_lock, flags); } if (pdata->netdev) hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, (uintptr_t)pdata->netdev); spin_unlock_irqrestore(&ndev_hash_lock, flags); } /** * ib_device_set_netdev - Associate the ib_dev with an underlying net_device * @ib_dev: Device to modify * @ndev: net_device to affiliate, may be NULL * @port: IB port the net_device is connected to * * Drivers should use this to link the ib_device to a netdev so the netdev * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be * affiliated with any port. * * The caller must ensure that the given ndev is not unregistered or * unregistering, and that either the ib_device is unregistered or * ib_device_set_netdev() is called with NULL when the ndev sends a * NETDEV_UNREGISTER event. */ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; int ret; if (!rdma_is_port_valid(ib_dev, port)) return -EINVAL; /* * Drivers wish to call this before ib_register_driver, so we have to * setup the port data early. */ ret = alloc_port_data(ib_dev); if (ret) return ret; pdata = &ib_dev->port_data[port]; spin_lock_irqsave(&pdata->netdev_lock, flags); old_ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (old_ndev == ndev) { spin_unlock_irqrestore(&pdata->netdev_lock, flags); return 0; } rcu_assign_pointer(pdata->netdev, ndev); netdev_put(old_ndev, &pdata->netdev_tracker); netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); /* Make sure that the device is registered before we send events */ if (xa_load(&devices, ib_dev->index) != ib_dev) return 0; etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; rdma_nl_notify_event(ib_dev, port, etype); return 0; } EXPORT_SYMBOL(ib_device_set_netdev); static void free_netdevs(struct ib_device *ib_dev) { unsigned long flags; u32 port; if (!ib_dev->port_data) return; rdma_for_each_port (ib_dev, port) { struct ib_port_data *pdata = &ib_dev->port_data[port]; struct net_device *ndev; spin_lock_irqsave(&pdata->netdev_lock, flags); ndev = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); if (ndev) { spin_lock(&ndev_hash_lock); hash_del_rcu(&pdata->ndev_hash_link); spin_unlock(&ndev_hash_lock); /* * If this is the last dev_put there is still a * synchronize_rcu before the netdev is kfreed, so we * can continue to rely on unlocked pointer * comparisons after the put */ rcu_assign_pointer(pdata->netdev, NULL); netdev_put(ndev, &pdata->netdev_tracker); } spin_unlock_irqrestore(&pdata->netdev_lock, flags); } } struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port) { struct ib_port_data *pdata; struct net_device *res; if (!rdma_is_port_valid(ib_dev, port)) return NULL; if (!ib_dev->port_data) return NULL; pdata = &ib_dev->port_data[port]; /* * New drivers should use ib_device_set_netdev() not the legacy * get_netdev(). */ if (ib_dev->ops.get_netdev) res = ib_dev->ops.get_netdev(ib_dev, port); else { spin_lock(&pdata->netdev_lock); res = rcu_dereference_protected( pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); dev_hold(res); spin_unlock(&pdata->netdev_lock); } return res; } EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_query_netdev_port - Query the port number of a net_device * associated with an ibdev * @ibdev: IB device * @ndev: Network device * @port: IB port the net_device is connected to */ int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port) { struct net_device *ib_ndev; u32 port_num; rdma_for_each_port(ibdev, port_num) { ib_ndev = ib_device_get_netdev(ibdev, port_num); if (ndev == ib_ndev) { *port = port_num; dev_put(ib_ndev); return 0; } dev_put(ib_ndev); } return -ENOENT; } EXPORT_SYMBOL(ib_query_netdev_port); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) * * Find and hold an ib_device that is associated with a netdev via * ib_device_set_netdev(). The caller must call ib_device_put() on the * returned pointer. */ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id) { struct ib_device *res = NULL; struct ib_port_data *cur; rcu_read_lock(); hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || cur->ib_dev->ops.driver_id == driver_id) && ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; } } rcu_read_unlock(); return res; } EXPORT_SYMBOL(ib_device_get_by_netdev); /** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all of the physical RoCE ports of ib_dev * which are related to netdevice and calls callback() on each * device for which filter() function returns non zero. */ void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { u32 port; rdma_for_each_port (ib_dev, port) if (rdma_protocol_roce(ib_dev, port)) { struct net_device *idev = ib_device_get_netdev(ib_dev, port); if (filter(ib_dev, port, idev, filter_cookie)) cb(ib_dev, port, idev, cookie); dev_put(idev); } } /** * ib_enum_all_roce_netdevs - enumerate all RoCE devices * @filter: Should we call the callback? * @filter_cookie: Cookie passed to filter * @cb: Callback to call for each found RoCE ports * @cookie: Cookie passed back to the callback * * Enumerates all RoCE devices' physical ports which are related * to netdevices and calls callback() on each device for which * filter() function returns non zero. */ void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie) { struct ib_device *dev; unsigned long index; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); up_read(&devices_rwsem); } /* * ib_enum_all_devs - enumerate all ib_devices * @cb: Callback to call for each found ib_device * * Enumerates all ib_devices and calls callback() on each device. */ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb) { unsigned long index; struct ib_device *dev; unsigned int idx = 0; int ret = 0; down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) continue; ret = nldev_cb(dev, skb, cb, idx); if (ret) break; idx++; } up_read(&devices_rwsem); return ret; } /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey) { if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (!device->ops.query_pkey) return -EOPNOTSUPP; return device->ops.query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->ops.modify_device) return -EOPNOTSUPP; return device->ops.modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { int rc; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; if (device->ops.modify_port) rc = device->ops.modify_port(device, port_num, port_modify_mask, port_modify); else if (rdma_protocol_roce(device, port_num) && ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) rc = 0; else rc = -EOPNOTSUPP; return rc; } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index) { union ib_gid tmp_gid; u32 port; int ret, i; rdma_for_each_port (device, port) { if (!rdma_protocol_ib(device, port)) continue; for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) continue; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); /** * ib_get_net_dev_by_params() - Return the appropriate net_dev * for a received CM request * @dev: An RDMA device on which the request has been received. * @port: Port number on the RDMA device. * @pkey: The Pkey the request came on. * @gid: A GID that the net_dev uses to communicate. * @addr: Contains the IP address that the request specified as its * destination. * */ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr) { struct net_device *net_dev = NULL; unsigned long index; void *client_data; if (!rdma_protocol_ib(dev, port)) return NULL; /* * Holding the read side guarantees that the client will not become * unregistered while we are calling get_net_dev_by_params() */ down_read(&dev->client_data_rwsem); xan_for_each_marked (&dev->client_data, index, client_data, CLIENT_DATA_REGISTERED) { struct ib_client *client = xa_load(&clients, index); if (!client || !client->get_net_dev_by_params) continue; net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, addr, client_data); if (net_dev) break; } up_read(&dev->client_data_rwsem); return net_dev; } EXPORT_SYMBOL(ib_get_net_dev_by_params); void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) { struct ib_device_ops *dev_ops = &dev->ops; #define SET_DEVICE_OP(ptr, name) \ do { \ if (ops->name) \ if (!((ptr)->name)) \ (ptr)->name = ops->name; \ } while (0) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && dev_ops->driver_id != ops->driver_id); dev_ops->driver_id = ops->driver_id; } if (ops->owner) { WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); dev_ops->owner = ops->owner; } if (ops->uverbs_abi_ver) dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; dev_ops->uverbs_no_driver_id_binding |= ops->uverbs_no_driver_id_binding; SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); SET_DEVICE_OP(dev_ops, alloc_ucontext); SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); SET_DEVICE_OP(dev_ops, counter_alloc_stats); SET_DEVICE_OP(dev_ops, counter_bind_qp); SET_DEVICE_OP(dev_ops, counter_dealloc); SET_DEVICE_OP(dev_ops, counter_init); SET_DEVICE_OP(dev_ops, counter_unbind_qp); SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); SET_DEVICE_OP(dev_ops, create_srq); SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); SET_DEVICE_OP(dev_ops, dealloc_xrcd); SET_DEVICE_OP(dev_ops, del_gid); SET_DEVICE_OP(dev_ops, del_sub_dev); SET_DEVICE_OP(dev_ops, dereg_mr); SET_DEVICE_OP(dev_ops, destroy_ah); SET_DEVICE_OP(dev_ops, destroy_counters); SET_DEVICE_OP(dev_ops, destroy_cq); SET_DEVICE_OP(dev_ops, destroy_flow); SET_DEVICE_OP(dev_ops, destroy_flow_action); SET_DEVICE_OP(dev_ops, destroy_qp); SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); SET_DEVICE_OP(dev_ops, destroy_srq); SET_DEVICE_OP(dev_ops, destroy_wq); SET_DEVICE_OP(dev_ops, device_group); SET_DEVICE_OP(dev_ops, detach_mcast); SET_DEVICE_OP(dev_ops, disassociate_ucontext); SET_DEVICE_OP(dev_ops, drain_rq); SET_DEVICE_OP(dev_ops, drain_sq); SET_DEVICE_OP(dev_ops, enable_driver); SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry); SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_mr_entry); SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_qp_entry); SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); SET_DEVICE_OP(dev_ops, fill_res_srq_entry); SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); SET_DEVICE_OP(dev_ops, get_dev_fw_str); SET_DEVICE_OP(dev_ops, get_dma_mr); SET_DEVICE_OP(dev_ops, get_hw_stats); SET_DEVICE_OP(dev_ops, get_link_layer); SET_DEVICE_OP(dev_ops, get_netdev); SET_DEVICE_OP(dev_ops, get_numa_node); SET_DEVICE_OP(dev_ops, get_port_immutable); SET_DEVICE_OP(dev_ops, get_vector_affinity); SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_guid); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); SET_DEVICE_OP(dev_ops, iw_create_listen); SET_DEVICE_OP(dev_ops, iw_destroy_listen); SET_DEVICE_OP(dev_ops, iw_get_qp); SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); SET_DEVICE_OP(dev_ops, modify_cq); SET_DEVICE_OP(dev_ops, modify_device); SET_DEVICE_OP(dev_ops, modify_hw_stat); SET_DEVICE_OP(dev_ops, modify_port); SET_DEVICE_OP(dev_ops, modify_qp); SET_DEVICE_OP(dev_ops, modify_srq); SET_DEVICE_OP(dev_ops, modify_wq); SET_DEVICE_OP(dev_ops, peek_cq); SET_DEVICE_OP(dev_ops, poll_cq); SET_DEVICE_OP(dev_ops, port_groups); SET_DEVICE_OP(dev_ops, post_recv); SET_DEVICE_OP(dev_ops, post_send); SET_DEVICE_OP(dev_ops, post_srq_recv); SET_DEVICE_OP(dev_ops, process_mad); SET_DEVICE_OP(dev_ops, query_ah); SET_DEVICE_OP(dev_ops, query_device); SET_DEVICE_OP(dev_ops, query_gid); SET_DEVICE_OP(dev_ops, query_pkey); SET_DEVICE_OP(dev_ops, query_port); SET_DEVICE_OP(dev_ops, query_qp); SET_DEVICE_OP(dev_ops, query_srq); SET_DEVICE_OP(dev_ops, query_ucontext); SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); SET_DEVICE_OP(dev_ops, read_counters); SET_DEVICE_OP(dev_ops, reg_dm_mr); SET_DEVICE_OP(dev_ops, reg_user_mr); SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); SET_DEVICE_OP(dev_ops, req_notify_cq); SET_DEVICE_OP(dev_ops, rereg_user_mr); SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); SET_OBJ_SIZE(dev_ops, ib_xrcd); SET_OBJ_SIZE(dev_ops, rdma_counter); } EXPORT_SYMBOL(ib_set_device_ops); int ib_add_sub_device(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name) { struct ib_device *sub; int ret = 0; if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) return -EOPNOTSUPP; if (!ib_device_try_get(parent)) return -EINVAL; sub = parent->ops.add_sub_dev(parent, type, name); if (IS_ERR(sub)) { ib_device_put(parent); return PTR_ERR(sub); } sub->type = type; sub->parent = parent; mutex_lock(&parent->subdev_lock); list_add_tail(&parent->subdev_list_head, &sub->subdev_list); mutex_unlock(&parent->subdev_lock); return ret; } EXPORT_SYMBOL(ib_add_sub_device); int ib_del_sub_device_and_put(struct ib_device *sub) { struct ib_device *parent = sub->parent; if (!parent) return -EOPNOTSUPP; mutex_lock(&parent->subdev_lock); list_del(&sub->subdev_list); mutex_unlock(&parent->subdev_lock); ib_device_put(sub); parent->ops.del_sub_dev(sub); ib_device_put(parent); return 0; } EXPORT_SYMBOL(ib_del_sub_device_and_put); #ifdef CONFIG_INFINIBAND_VIRT_DMA int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) { struct scatterlist *s; int i; for_each_sg(sg, s, nents, i) { sg_dma_address(s) = (uintptr_t)sg_virt(s); sg_dma_len(s) = s->length; } return nents; } EXPORT_SYMBOL(ib_dma_virt_map_sg); #endif /* CONFIG_INFINIBAND_VIRT_DMA */ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { [RDMA_NL_LS_OP_RESOLVE] = { .doit = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { .doit = ib_nl_handle_set_timeout, .flags = RDMA_NL_ADMIN_PERM, }, [RDMA_NL_LS_OP_IP_RESOLVE] = { .doit = ib_nl_handle_ip_res_resp, .flags = RDMA_NL_ADMIN_PERM, }, }; void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) { enum ib_port_state curr_state; struct ib_event ibevent = {}; u32 port; if (ib_query_netdev_port(ibdev, ndev, &port)) return; curr_state = ib_get_curr_port_state(ndev); write_lock_irq(&ibdev->cache_lock); if (ibdev->port_data[port].cache.last_port_state == curr_state) { write_unlock_irq(&ibdev->cache_lock); return; } ibdev->port_data[port].cache.last_port_state = curr_state; write_unlock_irq(&ibdev->cache_lock); ibevent.event = (curr_state == IB_PORT_DOWN) ? IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; ibevent.device = ibdev; ibevent.element.port_num = port; ib_dispatch_event(&ibevent); } EXPORT_SYMBOL(ib_dispatch_port_state_event); static void handle_port_event(struct net_device *ndev, unsigned long event) { struct ib_device *ibdev; /* Currently, link events in bonding scenarios are still * reported by drivers that support bonding. */ if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) return; ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return; if (ibdev->ops.report_port_event) { ibdev->ops.report_port_event(ibdev, ndev, event); goto put_ibdev; } ib_dispatch_port_state_event(ibdev, ndev); put_ibdev: ib_device_put(ibdev); }; static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct ib_device *ibdev; u32 port; switch (event) { case NETDEV_CHANGENAME: ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); if (!ibdev) return NOTIFY_DONE; if (ib_query_netdev_port(ibdev, ndev, &port)) { ib_device_put(ibdev); break; } rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; case NETDEV_UP: case NETDEV_CHANGE: case NETDEV_DOWN: handle_port_event(ndev, event); break; default: break; } return NOTIFY_DONE; } static struct notifier_block nb_netdevice = { .notifier_call = ib_netdevice_event, }; static int __init ib_core_init(void) { int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!ib_unreg_wq) goto err; ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!ib_comp_wq) goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); if (!ib_comp_unbound_wq) goto err_comp; ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); goto err_comp_unbound; } rdma_nl_init(); ret = addr_init(); if (ret) { pr_warn("Couldn't init IB address resolution\n"); goto err_ibnl; } ret = ib_mad_init(); if (ret) { pr_warn("Couldn't init IB MAD\n"); goto err_addr; } ret = ib_sa_init(); if (ret) { pr_warn("Couldn't init SA\n"); goto err_mad; } ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. ret %d\n", ret); goto err_sa; } ret = register_pernet_device(&rdma_dev_net_ops); if (ret) { pr_warn("Couldn't init compat dev. ret %d\n", ret); goto err_compat; } nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ret = roce_gid_mgmt_init(); if (ret) { pr_warn("Couldn't init RoCE GID management\n"); goto err_parent; } register_netdevice_notifier(&nb_netdevice); return 0; err_parent: rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); err_compat: unregister_blocking_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: ib_mad_cleanup(); err_addr: addr_cleanup(); err_ibnl: class_unregister(&ib_class); err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err_unbound: destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { unregister_netdevice_notifier(&nb_netdevice); roce_gid_mgmt_cleanup(); rdma_nl_unregister(RDMA_NL_LS); nldev_exit(); unregister_pernet_device(&rdma_dev_net_ops); unregister_blocking_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); /* ib core relies on netdev stack to first register net_ns_type_operations * ns kobject type before ib_core initialization. */ fs_initcall(ib_core_init); module_exit(ib_core_cleanup);
283 283 235 294 294 289 283 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0 /* * VMID allocator. * * Based on Arm64 ASID allocator algorithm. * Please refer arch/arm64/mm/context.c for detailed * comments on algorithm. * * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved. * Copyright (C) 2012 ARM Ltd. */ #include <linux/bitfield.h> #include <linux/bitops.h> #include <asm/kvm_asm.h> #include <asm/kvm_mmu.h> unsigned int __ro_after_init kvm_arm_vmid_bits; static DEFINE_RAW_SPINLOCK(cpu_vmid_lock); static atomic64_t vmid_generation; static unsigned long *vmid_map; static DEFINE_PER_CPU(atomic64_t, active_vmids); static DEFINE_PER_CPU(u64, reserved_vmids); #define VMID_MASK (~GENMASK(kvm_arm_vmid_bits - 1, 0)) #define VMID_FIRST_VERSION (1UL << kvm_arm_vmid_bits) #define NUM_USER_VMIDS VMID_FIRST_VERSION #define vmid2idx(vmid) ((vmid) & ~VMID_MASK) #define idx2vmid(idx) vmid2idx(idx) /* * As vmid #0 is always reserved, we will never allocate one * as below and can be treated as invalid. This is used to * set the active_vmids on vCPU schedule out. */ #define VMID_ACTIVE_INVALID VMID_FIRST_VERSION #define vmid_gen_match(vmid) \ (!(((vmid) ^ atomic64_read(&vmid_generation)) >> kvm_arm_vmid_bits)) static void flush_context(void) { int cpu; u64 vmid; bitmap_zero(vmid_map, NUM_USER_VMIDS); for_each_possible_cpu(cpu) { vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0); /* Preserve reserved VMID */ if (vmid == 0) vmid = per_cpu(reserved_vmids, cpu); __set_bit(vmid2idx(vmid), vmid_map); per_cpu(reserved_vmids, cpu) = vmid; } /* * Unlike ASID allocator, we expect less frequent rollover in * case of VMIDs. Hence, instead of marking the CPU as * flush_pending and issuing a local context invalidation on * the next context-switch, we broadcast TLB flush + I-cache * invalidation over the inner shareable domain on rollover. */ kvm_call_hyp(__kvm_flush_vm_context); } static bool check_update_reserved_vmid(u64 vmid, u64 newvmid) { int cpu; bool hit = false; /* * Iterate over the set of reserved VMIDs looking for a match * and update to use newvmid (i.e. the same VMID in the current * generation). */ for_each_possible_cpu(cpu) { if (per_cpu(reserved_vmids, cpu) == vmid) { hit = true; per_cpu(reserved_vmids, cpu) = newvmid; } } return hit; } static u64 new_vmid(struct kvm_vmid *kvm_vmid) { static u32 cur_idx = 1; u64 vmid = atomic64_read(&kvm_vmid->id); u64 generation = atomic64_read(&vmid_generation); if (vmid != 0) { u64 newvmid = generation | (vmid & ~VMID_MASK); if (check_update_reserved_vmid(vmid, newvmid)) { atomic64_set(&kvm_vmid->id, newvmid); return newvmid; } if (!__test_and_set_bit(vmid2idx(vmid), vmid_map)) { atomic64_set(&kvm_vmid->id, newvmid); return newvmid; } } vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, cur_idx); if (vmid != NUM_USER_VMIDS) goto set_vmid; /* We're out of VMIDs, so increment the global generation count */ generation = atomic64_add_return_relaxed(VMID_FIRST_VERSION, &vmid_generation); flush_context(); /* We have more VMIDs than CPUs, so this will always succeed */ vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, 1); set_vmid: __set_bit(vmid, vmid_map); cur_idx = vmid; vmid = idx2vmid(vmid) | generation; atomic64_set(&kvm_vmid->id, vmid); return vmid; } /* Called from vCPU sched out with preemption disabled */ void kvm_arm_vmid_clear_active(void) { atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; vmid = atomic64_read(&kvm_vmid->id); /* * Please refer comments in check_and_switch_context() in * arch/arm64/mm/context.c. * * Unlike ASID allocator, we set the active_vmids to * VMID_ACTIVE_INVALID on vCPU schedule out to avoid * reserving the VMID space needlessly on rollover. * Hence explicitly check here for a "!= 0" to * handle the sync with a concurrent rollover. */ old_active_vmid = atomic64_read(this_cpu_ptr(&active_vmids)); if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. */ vmid = atomic64_read(&kvm_vmid->id); if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); } /* * Initialize the VMID allocator */ int __init kvm_arm_vmid_alloc_init(void) { kvm_arm_vmid_bits = kvm_get_vmid_bits(); /* * Expect allocation after rollover to fail if we don't have * at least one more VMID than CPUs. VMID #0 is always reserved. */ WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus()); atomic64_set(&vmid_generation, VMID_FIRST_VERSION); vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL); if (!vmid_map) return -ENOMEM; return 0; } void __init kvm_arm_vmid_alloc_free(void) { bitmap_free(vmid_map); }
8 5 3 3 2 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The Internet Protocol (IP) module. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Richard Underwood * Stefan Becker, <stefanb@yello.ping.de> * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code * (just stops a compiler warning). * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes * are junked rather than corrupting things. * Alan Cox : Frames to bad broadcast subnets are dumped * We used to process them non broadcast and * boy could that cause havoc. * Alan Cox : ip_forward sets the free flag on the * new frame it queues. Still crap because * it copies the frame but at least it * doesn't eat memory too. * Alan Cox : Generic queue code and memory fixes. * Fred Van Kempen : IP fragment support (borrowed from NET2E) * Gerhard Koerting: Forward fragmented frames correctly. * Gerhard Koerting: Fixes to my fix of the above 8-). * Gerhard Koerting: IP interface addressing fix. * Linus Torvalds : More robustness checks * Alan Cox : Even more checks: Still not as robust as it ought to be * Alan Cox : Save IP header pointer for later * Alan Cox : ip option setting * Alan Cox : Use ip_tos/ip_ttl settings * Alan Cox : Fragmentation bogosity removed * (Thanks to Mark.Bush@prg.ox.ac.uk) * Dmitry Gorodchanin : Send of a raw packet crash fix. * Alan Cox : Silly ip bug when an overlength * fragment turns up. Now frees the * queue. * Linus Torvalds/ : Memory leakage on fragmentation * Alan Cox : handling. * Gerhard Koerting: Forwarding uses IP priority hints * Teemu Rantanen : Fragment problems. * Alan Cox : General cleanup, comments and reformat * Alan Cox : SNMP statistics * Alan Cox : BSD address rule semantics. Also see * UDP as there is a nasty checksum issue * if you do things the wrong way. * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file * Alan Cox : IP options adjust sk->priority. * Pedro Roque : Fix mtu/length error in ip_forward. * Alan Cox : Avoid ip_chk_addr when possible. * Richard Underwood : IP multicasting. * Alan Cox : Cleaned up multicast handlers. * Alan Cox : RAW sockets demultiplex in the BSD style. * Gunther Mayer : Fix the SNMP reporting typo * Alan Cox : Always in group 224.0.0.1 * Pauline Middelink : Fast ip_checksum update when forwarding * Masquerading support. * Alan Cox : Multicast loopback error for 224.0.0.1 * Alan Cox : IP_MULTICAST_LOOP option. * Alan Cox : Use notifiers. * Bjorn Ekwall : Removed ip_csum (from slhc.c too) * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) * Stefan Becker : Send out ICMP HOST REDIRECT * Arnt Gulbrandsen : ip_build_xmit * Alan Cox : Per socket routing cache * Alan Cox : Fixed routing cache, added header cache. * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. * Alan Cox : Incoming IP option handling. * Alan Cox : Set saddr on raw output frames as per BSD. * Alan Cox : Stopped broadcast source route explosions. * Alan Cox : Can disable source routing * Takeshi Sone : Masquerading didn't work. * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. * Alan Cox : Memory leaks, tramples, misc debugging. * Alan Cox : Fixed multicast (by popular demand 8)) * Alan Cox : Fixed forwarding (by even more popular demand 8)) * Alan Cox : Fixed SNMP statistics [I think] * Gerhard Koerting : IP fragmentation forwarding fix * Alan Cox : Device lock against page fault. * Alan Cox : IP_HDRINCL facility. * Werner Almesberger : Zero fragment bug * Alan Cox : RAW IP frame length bug * Alan Cox : Outgoing firewall on build_xmit * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel * Alan Cox : Multicast routing hooks * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit * the allocation of a buffer that can then be 'grown' by twiddling page tables. * Output fragmentation wants updating along with the buffer management to use a single * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/arp.h> #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> #include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) */ bool ip_call_ra_chain(struct sk_buff *skb) { struct ip_ra_chain *ra; u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; struct net *net = dev_net(dev); for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report * the packet if it came from that interface. */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) raw_rcv(last, skb2); } last = sk; } } if (last) { raw_rcv(last, skb); return true; } return false; } INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { const struct net_protocol *ipprot; int raw, ret; resubmit: raw = raw_local_deliver(skb, protocol); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot) { if (!ipprot->no_policy) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY); return; } nf_reset_ct(skb); } ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, skb); if (ret < 0) { protocol = -ret; goto resubmit; } __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } } static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); __skb_pull(skb, skb_network_header_len(skb)); rcu_read_lock(); ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; } /* * Deliver IP Packets to the higher protocol layers. */ int ip_local_deliver(struct sk_buff *skb) { /* * Reassemble IP fragments. */ struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } EXPORT_SYMBOL(ip_local_deliver); static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { struct ip_options *opt; const struct iphdr *iph; /* It looks as overkill, because not all IP options require packet mangling. But it is the easiest for now, especially taking into account that combination of IP options and running sniffer is extremely rare condition. --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } if (unlikely(opt->srr)) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { if (!IN_DEV_SOURCE_ROUTE(in_dev)) { if (IN_DEV_LOG_MARTIANS(in_dev)) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); goto drop; } } if (ip_options_rcv_srr(skb, dev)) goto drop; } return false; drop: return true; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, const struct sk_buff *hint) { return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && ip_hdr(hint)->tos == iph->tos; } int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); int err, drop_reason; struct rtable *rt; if (ip_can_use_hint(skb, iph, hint)) { drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev, hint); if (unlikely(drop_reason)) goto drop_error; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_TCP: if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { tcp_v4_early_demux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; case IPPROTO_UDP: if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { err = udp_v4_early_demux(skb); if (unlikely(err)) goto drop_error; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } break; } } /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ if (!skb_valid_dst(skb)) { drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; st[idx&0xFF].o_packets++; st[idx&0xFF].o_bytes += skb->len; st[(idx>>16)&0xFF].i_packets++; st[(idx>>16)&0xFF].i_bytes += skb->len; } #endif if (iph->ihl > 5 && ip_rcv_options(skb, dev)) goto drop; rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(dev); /* RFC 1122 3.3.6: * * When a host sends a datagram to a link-layer broadcast * address, the IP destination address MUST be a legal IP * broadcast or IP multicast address. * * A host SHOULD silently discard a datagram that is received * via a link-layer broadcast (see Section 2.4) but does not * specify an IP multicast or broadcast destination address. * * This doesn't explicitly say L2 *broadcast*, but broadcast is * in a way a form of multicast and the most common use case for * this is 802.11 protecting against cross-station spoofing (the * so-called "hole-196" attack) so do it for both. */ if (in_dev && IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; goto drop; } } return NET_RX_SUCCESS; drop: kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; drop_error: if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) return NET_RX_SUCCESS; ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; } /* * Main IP Receive routine. */ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ if (skb->pkt_type == PACKET_OTHERHOST) { dev_core_stats_rx_otherhost_dropped_inc(skb->dev); drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; } __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; iph = ip_hdr(skb); /* * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. * * Is the datagram acceptable? * * 1. Length at least the size of an ip header * 2. Version of 4 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] * 4. Doesn't have a bogus length */ if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); __IP_ADD_STATS(net, IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; iph = ip_hdr(skb); if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; len = iph_totlen(skb, iph); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it * is IP we can trim to the true length of the frame. * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ if (!skb_sk_is_prefetched(skb)) skb_orphan(skb); return skb; csum_error: drop_reason = SKB_DROP_REASON_IP_CSUM; __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) drop_reason = SKB_DROP_REASON_IP_INHDR; __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb_reason(skb, drop_reason); out: return NULL; } /* * IP receive entry point */ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct net *net = dev_net(dev); skb = ip_rcv_core(skb, net); if (skb == NULL) return NET_RX_DROP; return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb, dev, NULL, ip_rcv_finish); } static void ip_sublist_rcv_finish(struct list_head *head) { struct sk_buff *skb, *next; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); dst_input(skb); } } static struct sk_buff *ip_extract_route_hint(const struct net *net, struct sk_buff *skb, int rt_type) { if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; return skb; } static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip_rcv(skb); if (!skb) continue; if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); if (curr_dst != dst) { hint = ip_extract_route_hint(net, skb, dst_rtable(dst)->rt_type); /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv_finish(&sublist); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dst = dst; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ ip_sublist_rcv_finish(&sublist); } static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, struct net *net) { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ void ip_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev) { struct net_device *curr_dev = NULL; struct net *curr_net = NULL; struct sk_buff *skb, *next; LIST_HEAD(sublist); list_for_each_entry_safe(skb, next, head, list) { struct net_device *dev = skb->dev; struct net *net = dev_net(dev); skb_list_del_init(skb); skb = ip_rcv_core(skb, net); if (skb == NULL) continue; if (curr_dev != dev || curr_net != net) { /* dispatch old sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); /* start new sublist */ INIT_LIST_HEAD(&sublist); curr_dev = dev; curr_net = net; } list_add_tail(&skb->list, &sublist); } /* dispatch final sublist */ if (!list_empty(&sublist)) ip_sublist_rcv(&sublist, curr_dev, curr_net); }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/workqueue.h> #include <linux/spinlock.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> #include <net/inet_dscp.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_flow_table.h> struct nft_flow_offload { struct nft_flowtable *flowtable; }; static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) { if (dst_xfrm(dst)) return FLOW_OFFLOAD_XMIT_XFRM; return FLOW_OFFLOAD_XMIT_NEIGH; } static void nft_default_forward_path(struct nf_flow_route *route, struct dst_entry *dst_cache, enum ip_conntrack_dir dir) { route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; route->tuple[dir].dst = dst_cache; route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); } static bool nft_is_valid_ether_device(const struct net_device *dev) { if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return false; return true; } static int nft_dev_fill_forward_path(const struct nf_flow_route *route, const struct dst_entry *dst_cache, const struct nf_conn *ct, enum ip_conntrack_dir dir, u8 *ha, struct net_device_path_stack *stack) { const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; struct net_device *dev = dst_cache->dev; struct neighbour *n; u8 nud_state; if (!nft_is_valid_ether_device(dev)) goto out; n = dst_neigh_lookup(dst_cache, daddr); if (!n) return -1; read_lock_bh(&n->lock); nud_state = n->nud_state; ether_addr_copy(ha, n->ha); read_unlock_bh(&n->lock); neigh_release(n); if (!(nud_state & NUD_VALID)) return -1; out: return dev_fill_forward_path(dev, ha, stack); } struct nft_forward_info { const struct net_device *indev; const struct net_device *outdev; const struct net_device *hw_outdev; struct id { __u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; u8 num_encaps; u8 ingress_vlans; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; enum flow_offload_xmit_type xmit_type; }; static void nft_dev_path_info(const struct net_device_path_stack *stack, struct nft_forward_info *info, unsigned char *ha, struct nf_flowtable *flowtable) { const struct net_device_path *path; int i; memcpy(info->h_dest, ha, ETH_ALEN); for (i = 0; i < stack->num_paths; i++) { path = &stack->path[i]; switch (path->type) { case DEV_PATH_ETHERNET: case DEV_PATH_DSA: case DEV_PATH_VLAN: case DEV_PATH_PPPOE: info->indev = path->dev; if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); if (path->type == DEV_PATH_ETHERNET) break; if (path->type == DEV_PATH_DSA) { i = stack->num_paths; break; } /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { info->indev = NULL; break; } if (!info->outdev) info->outdev = path->dev; info->encap[info->num_encaps].id = path->encap.id; info->encap[info->num_encaps].proto = path->encap.proto; info->num_encaps++; if (path->type == DEV_PATH_PPPOE) memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); break; case DEV_PATH_BRIDGE: if (is_zero_ether_addr(info->h_source)) memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); switch (path->bridge.vlan_mode) { case DEV_PATH_BR_VLAN_UNTAG_HW: info->ingress_vlans |= BIT(info->num_encaps - 1); break; case DEV_PATH_BR_VLAN_TAG: info->encap[info->num_encaps].id = path->bridge.vlan_id; info->encap[info->num_encaps].proto = path->bridge.vlan_proto; info->num_encaps++; break; case DEV_PATH_BR_VLAN_UNTAG: info->num_encaps--; break; case DEV_PATH_BR_VLAN_KEEP: break; } info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; break; default: info->indev = NULL; break; } } if (!info->outdev) info->outdev = info->indev; info->hw_outdev = info->indev; if (nf_flowtable_hw_offload(flowtable) && nft_is_valid_ether_device(info->indev)) info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; } static bool nft_flowtable_find_dev(const struct net_device *dev, struct nft_flowtable *ft) { struct nft_hook *hook; bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; break; } return found; } static void nft_dev_forward_path(struct nf_flow_route *route, const struct nf_conn *ct, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { const struct dst_entry *dst = route->tuple[dir].dst; struct net_device_path_stack stack; struct nft_forward_info info = {}; unsigned char ha[ETH_ALEN]; int i; if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) nft_dev_path_info(&stack, &info, ha, &ft->data); if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) return; route->tuple[!dir].in.ifindex = info.indev->ifindex; for (i = 0; i < info.num_encaps; i++) { route->tuple[!dir].in.encap[i].id = info.encap[i].id; route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; } route->tuple[!dir].in.num_encaps = info.num_encaps; route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); route->tuple[dir].out.ifindex = info.outdev->ifindex; route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; route->tuple[dir].xmit_type = info.xmit_type; } } static int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, struct nf_flow_route *route, enum ip_conntrack_dir dir, struct nft_flowtable *ft) { struct dst_entry *this_dst = skb_dst(pkt->skb); struct dst_entry *other_dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(fl)); switch (nft_pf(pkt)) { case NFPROTO_IPV4: fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; case NFPROTO_IPV6: fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); fl.u.ip6.flowi6_mark = pkt->skb->mark; fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; break; } if (!dst_hold_safe(this_dst)) return -ENOENT; nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); if (!other_dst) { dst_release(this_dst); return -ENOENT; } nft_default_forward_path(route, this_dst, dir); nft_default_forward_path(route, other_dst, !dir); if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { nft_dev_forward_path(route, ct, dir, ft); nft_dev_forward_path(route, ct, !dir, ft); } return 0; } static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) return true; if (family == NFPROTO_IPV4) { const struct ip_options *opt; opt = &(IPCB(skb)->opt); if (unlikely(opt->optlen)) return true; } return false; } static void flow_offload_ct_tcp(struct nf_conn *ct) { /* conntrack will not see all packets, disable tcp window validation. */ spin_lock_bh(&ct->lock); ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; spin_unlock_bh(&ct->lock); } static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; struct tcphdr _tcph, *tcph = NULL; struct nf_flow_route route = {}; enum ip_conntrack_info ctinfo; struct flow_offload *flow; enum ip_conntrack_dir dir; struct nf_conn *ct; int ret; if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt))) goto out; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct) goto out; switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt), sizeof(_tcph), &_tcph); if (unlikely(!tcph || tcph->fin || tcph->rst || !nf_conntrack_tcp_established(ct))) goto out; break; case IPPROTO_UDP: break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; if (ct->status & IPS_NAT_MASK) goto out; tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) goto out; break; } #endif default: goto out; } if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) goto out; if (!nf_ct_is_confirmed(ct)) goto out; if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) goto out; dir = CTINFO2DIR(ctinfo); if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0) goto err_flow_route; flow = flow_offload_alloc(ct); if (!flow) goto err_flow_alloc; flow_offload_route_init(flow, &route); if (tcph) flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; return; err_flow_add: flow_offload_free(flow); err_flow_alloc: dst_release(route.tuple[dir].dst); dst_release(route.tuple[!dir].dst); err_flow_route: clear_bit(IPS_OFFLOAD_BIT, &ct->status); out: regs->verdict.code = NFT_BREAK; } static int nft_flow_offload_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { unsigned int hook_mask = (1 << NF_INET_FORWARD); if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, hook_mask); } static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, .len = NFT_NAME_MAXLEN - 1 }, }; static int nft_flow_offload_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_flow_offload *priv = nft_expr_priv(expr); u8 genmask = nft_genmask_next(ctx->net); struct nft_flowtable *flowtable; if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; flowtable = nft_flowtable_lookup(ctx->net, ctx->table, tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); if (!nft_use_inc(&flowtable->use)) return -EMFILE; priv->flowtable = flowtable; return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { struct nft_flow_offload *priv = nft_expr_priv(expr); nf_tables_deactivate_flowtable(ctx, priv->flowtable, phase); } static void nft_flow_offload_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_flow_offload *priv = nft_expr_priv(expr); nft_use_inc_restore(&priv->flowtable->use); } static void nft_flow_offload_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_flow_offload_type; static const struct nft_expr_ops nft_flow_offload_ops = { .type = &nft_flow_offload_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), .eval = nft_flow_offload_eval, .init = nft_flow_offload_init, .activate = nft_flow_offload_activate, .deactivate = nft_flow_offload_deactivate, .destroy = nft_flow_offload_destroy, .validate = nft_flow_offload_validate, .dump = nft_flow_offload_dump, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_flow_offload_type __read_mostly = { .name = "flow_offload", .ops = &nft_flow_offload_ops, .policy = nft_flow_offload_policy, .maxattr = NFTA_FLOW_MAX, .owner = THIS_MODULE, }; static int flow_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_DOWN) return NOTIFY_DONE; nf_flow_table_cleanup(dev); return NOTIFY_DONE; } static struct notifier_block flow_offload_netdev_notifier = { .notifier_call = flow_offload_netdev_event, }; static int __init nft_flow_offload_module_init(void) { int err; err = register_netdevice_notifier(&flow_offload_netdev_notifier); if (err) goto err; err = nft_register_expr(&nft_flow_offload_type); if (err < 0) goto register_expr; return 0; register_expr: unregister_netdevice_notifier(&flow_offload_netdev_notifier); err: return err; } static void __exit nft_flow_offload_module_exit(void) { nft_unregister_expr(&nft_flow_offload_type); unregister_netdevice_notifier(&flow_offload_netdev_notifier); } module_init(nft_flow_offload_module_init); module_exit(nft_flow_offload_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_ALIAS_NFT_EXPR("flow_offload"); MODULE_DESCRIPTION("nftables hardware flow offload module");
1 1 7 1 1 1 1 1 1 1 28 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef TUN_VNET_H #define TUN_VNET_H /* High bits in flags field are unused. */ #define TUN_VNET_LE 0x80000000 #define TUN_VNET_BE 0x40000000 static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags) { bool be = IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE); return !be && virtio_legacy_is_little_endian(); } static inline long tun_get_vnet_be(unsigned int flags, int __user *argp) { int be = !!(flags & TUN_VNET_BE); if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) return -EINVAL; if (put_user(be, argp)) return -EFAULT; return 0; } static inline long tun_set_vnet_be(unsigned int *flags, int __user *argp) { int be; if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE)) return -EINVAL; if (get_user(be, argp)) return -EFAULT; if (be) *flags |= TUN_VNET_BE; else *flags &= ~TUN_VNET_BE; return 0; } static inline bool tun_vnet_is_little_endian(unsigned int flags) { return flags & TUN_VNET_LE || tun_vnet_legacy_is_little_endian(flags); } static inline u16 tun_vnet16_to_cpu(unsigned int flags, __virtio16 val) { return __virtio16_to_cpu(tun_vnet_is_little_endian(flags), val); } static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val) { return __cpu_to_virtio16(tun_vnet_is_little_endian(flags), val); } static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags, unsigned int cmd, int __user *sp) { int s; switch (cmd) { case TUNGETVNETHDRSZ: s = *vnet_hdr_sz; if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETHDRSZ: if (get_user(s, sp)) return -EFAULT; if (s < (int)sizeof(struct virtio_net_hdr)) return -EINVAL; *vnet_hdr_sz = s; return 0; case TUNGETVNETLE: s = !!(*flags & TUN_VNET_LE); if (put_user(s, sp)) return -EFAULT; return 0; case TUNSETVNETLE: if (get_user(s, sp)) return -EFAULT; if (s) *flags |= TUN_VNET_LE; else *flags &= ~TUN_VNET_LE; return 0; case TUNGETVNETBE: return tun_get_vnet_be(*flags, sp); case TUNSETVNETBE: return tun_set_vnet_be(flags, sp); default: return -EINVAL; } } static inline int tun_vnet_hdr_get(int sz, unsigned int flags, struct iov_iter *from, struct virtio_net_hdr *hdr) { u16 hdr_len; if (iov_iter_count(from) < sz) return -EINVAL; if (!copy_from_iter_full(hdr, sizeof(*hdr), from)) return -EFAULT; hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len); if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { hdr_len = max(tun_vnet16_to_cpu(flags, hdr->csum_start) + tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2, hdr_len); hdr->hdr_len = cpu_to_tun_vnet16(flags, hdr_len); } if (hdr_len > iov_iter_count(from)) return -EINVAL; iov_iter_advance(from, sz - sizeof(*hdr)); return hdr_len; } static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter, const struct virtio_net_hdr *hdr) { if (unlikely(iov_iter_count(iter) < sz)) return -EINVAL; if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr))) return -EFAULT; if (iov_iter_zero(sz - sizeof(*hdr), iter) != sz - sizeof(*hdr)) return -EFAULT; return 0; } static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb, const struct virtio_net_hdr *hdr) { return virtio_net_hdr_to_skb(skb, hdr, tun_vnet_is_little_endian(flags)); } static inline int tun_vnet_hdr_from_skb(unsigned int flags, const struct net_device *dev, const struct sk_buff *skb, struct virtio_net_hdr *hdr) { int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0; if (virtio_net_hdr_from_skb(skb, hdr, tun_vnet_is_little_endian(flags), true, vlan_hlen)) { struct skb_shared_info *sinfo = skb_shinfo(skb); if (net_ratelimit()) { netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n", sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size), tun_vnet16_to_cpu(flags, hdr->hdr_len)); print_hex_dump(KERN_ERR, "tun: ", DUMP_PREFIX_NONE, 16, 1, skb->head, min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true); } WARN_ON_ONCE(1); return -EINVAL; } return 0; } #endif /* TUN_VNET_H */
15 286 286 286 286 43 15 15 20 20 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/container_of.h> #include <linux/rbtree_types.h> #include <linux/stddef.h> #include <linux/rcupdate.h> #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. * @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline struct rb_node * rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { struct rb_node *leftmost = NULL; if (root->rb_leftmost == node) leftmost = root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } /* * The below helper functions use 2 operators with 3 different * calling conventions. The operators are related like: * * comp(a->key,b) < 0 := less(a,b) * comp(a->key,b) > 0 := less(b,a) * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) * * If these operators define a partial order on the elements we make no * guarantee on which of the elements matching the key is found. See * rb_find(). * * The reason for this is to allow the find() interface without requiring an * on-stack dummy object, which might not be feasible due to object size. */ /** * rb_add_cached() - insert @node into the leftmost cached tree @tree * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order * * Returns @node when it is the new leftmost, or NULL. */ static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; bool leftmost = true; while (*link) { parent = *link; if (less(node, parent)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = false; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return leftmost ? node : NULL; } /** * rb_add() - insert @node into @tree * @node: node to insert * @tree: tree to insert @node into * @less: operator defining the (partial) node order */ static __always_inline void rb_add(struct rb_node *node, struct rb_root *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; while (*link) { parent = *link; if (less(node, parent)) link = &parent->rb_left; else link = &parent->rb_right; } rb_link_node(node, parent, link); rb_insert_color(node, tree); } /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree, int (*cmp)(const struct rb_node *new, const struct rb_node *exist)) { bool leftmost = true; struct rb_node **link = &tree->rb_root.rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) { link = &parent->rb_left; } else if (c > 0) { link = &parent->rb_right; leftmost = false; } else { return parent; } } rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); return NULL; } /** * rb_find_add() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find_add_rcu() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert * @tree: tree to search / modify * @cmp: operator defining the node order * * Adds a Store-Release for link_node. * * Returns the rb_node matching @node, or NULL when no match is found and @node * is inserted. */ static __always_inline struct rb_node * rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, int (*cmp)(struct rb_node *, const struct rb_node *)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; int c; while (*link) { parent = *link; c = cmp(node, parent); if (c < 0) link = &parent->rb_left; else if (c > 0) link = &parent->rb_right; else return parent; } rb_link_node_rcu(node, parent, link); rb_insert_color(node, tree); return NULL; } /** * rb_find() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = node->rb_left; else if (c > 0) node = node->rb_right; else return node; } return NULL; } /** * rb_find_rcu() - find @key in tree @tree * @key: key to match * @tree: tree to search * @cmp: operator defining the node order * * Notably, tree descent vs concurrent tree rotations is unsound and can result * in false-negatives. * * Returns the rb_node matching @key or NULL. */ static __always_inline struct rb_node * rb_find_rcu(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; while (node) { int c = cmp(key, node); if (c < 0) node = rcu_dereference_raw(node->rb_left); else if (c > 0) node = rcu_dereference_raw(node->rb_right); else return node; } return NULL; } /** * rb_find_first() - find the first @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the leftmost node matching @key, or NULL. */ static __always_inline struct rb_node * rb_find_first(const void *key, const struct rb_root *tree, int (*cmp)(const void *key, const struct rb_node *)) { struct rb_node *node = tree->rb_node; struct rb_node *match = NULL; while (node) { int c = cmp(key, node); if (c <= 0) { if (!c) match = node; node = node->rb_left; } else if (c > 0) { node = node->rb_right; } } return match; } /** * rb_next_match() - find the next @key in @tree * @key: key to match * @tree: tree to search * @cmp: operator defining node order * * Returns the next node matching @key, or NULL. */ static __always_inline struct rb_node * rb_next_match(const void *key, struct rb_node *node, int (*cmp)(const void *key, const struct rb_node *)) { node = rb_next(node); if (node && cmp(key, node)) node = NULL; return node; } /** * rb_for_each() - iterates a subtree matching @key * @node: iterator * @key: key to match * @tree: tree to search * @cmp: operator defining node order */ #define rb_for_each(node, key, tree, cmp) \ for ((node) = rb_find_first((key), (tree), (cmp)); \ (node); (node) = rb_next_match((key), (node), (cmp))) #endif /* _LINUX_RBTREE_H */
222 232 191 191 194 194 314 313 314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-only /* * mm/interval_tree.c - interval tree for mapping->i_mmap * * Copyright (C) 2012, Michel Lespinasse <walken@google.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/rmap.h> #include <linux/interval_tree_generic.h> static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) { return v->vm_pgoff; } static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root) { struct rb_node **link; struct vm_area_struct *parent; unsigned long last = vma_last_pgoff(node); VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); if (!prev->shared.rb.rb_right) { parent = prev; link = &prev->shared.rb.rb_right; } else { parent = rb_entry(prev->shared.rb.rb_right, struct vm_area_struct, shared.rb); if (parent->shared.rb_subtree_last < last) parent->shared.rb_subtree_last = last; while (parent->shared.rb.rb_left) { parent = rb_entry(parent->shared.rb.rb_left, struct vm_area_struct, shared.rb); if (parent->shared.rb_subtree_last < last) parent->shared.rb_subtree_last = last; } link = &parent->shared.rb.rb_left; } node->shared.rb_subtree_last = last; rb_link_node(&node->shared.rb, &parent->shared.rb, link); rb_insert_augmented(&node->shared.rb, &root->rb_root, &vma_interval_tree_augment); } static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) { return vma_start_pgoff(avc->vma); } static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) { return vma_last_pgoff(avc->vma); } INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, avc_start_pgoff, avc_last_pgoff, static inline, __anon_vma_interval_tree) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root) { #ifdef CONFIG_DEBUG_VM_RB node->cached_vma_start = avc_start_pgoff(node); node->cached_vma_last = avc_last_pgoff(node); #endif __anon_vma_interval_tree_insert(node, root); } void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root) { __anon_vma_interval_tree_remove(node, root); } struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_first(root, first, last); } struct anon_vma_chain * anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, unsigned long first, unsigned long last) { return __anon_vma_interval_tree_iter_next(node, first, last); } #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node) { WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_FLS64_H_ #define _ASM_GENERIC_BITOPS_FLS64_H_ #include <asm/types.h> /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #if BITS_PER_LONG == 32 static __always_inline int fls64(__u64 x) { __u32 h = x >> 32; if (h) return fls(h) + 32; return fls(x); } #elif BITS_PER_LONG == 64 static __always_inline int fls64(__u64 x) { if (x == 0) return 0; return __fls(x) + 1; } #else #error BITS_PER_LONG not 32 or 64 #endif #endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/netdevice.h> #include <linux/if.h> #include <linux/if_vlan.h> #include <net/udp_tunnel.h> #include <net/sch_generic.h> #include <linux/netfilter.h> #include <rdma/ib_addr.h> #include "rxe.h" #include "rxe_net.h" #include "rxe_loc.h" static struct rxe_recv_sockets recv_sockets; static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, struct net_device *ndev, struct in_addr *saddr, struct in_addr *daddr) { struct rtable *rt; struct flowi4 fl = { { 0 } }; memset(&fl, 0, sizeof(fl)); fl.flowi4_oif = ndev->ifindex; memcpy(&fl.saddr, saddr, sizeof(*saddr)); memcpy(&fl.daddr, daddr, sizeof(*daddr)); fl.flowi4_proto = IPPROTO_UDP; rt = ip_route_output_key(&init_net, &fl); if (IS_ERR(rt)) { rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); return NULL; } return &rt->dst; } #if IS_ENABLED(CONFIG_IPV6) static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { struct dst_entry *ndst; struct flowi6 fl6 = { { 0 } }; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = ndev->ifindex; memcpy(&fl6.saddr, saddr, sizeof(*saddr)); memcpy(&fl6.daddr, daddr, sizeof(*daddr)); fl6.flowi6_proto = IPPROTO_UDP; ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), recv_sockets.sk6->sk, &fl6, NULL); if (IS_ERR(ndst)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); return NULL; } if (unlikely(ndst->error)) { rxe_dbg_qp(qp, "no route to %pI6\n", daddr); goto put; } return ndst; put: dst_release(ndst); return NULL; } #else static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, struct net_device *ndev, struct in6_addr *saddr, struct in6_addr *daddr) { return NULL; } #endif static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { struct dst_entry *dst = NULL; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); if (!dst || !dst_check(dst, qp->dst_cookie)) { if (dst) dst_release(dst); if (av->network_type == RXE_NETWORK_TYPE_IPV4) { struct in_addr *saddr; struct in_addr *daddr; saddr = &av->sgid_addr._sockaddr_in.sin_addr; daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route4(qp, ndev, saddr, daddr); } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { struct in6_addr *saddr6; struct in6_addr *daddr6; saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route6(qp, ndev, saddr6, daddr6); #if IS_ENABLED(CONFIG_IPV6) if (dst) qp->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); #endif } if (dst && (qp_type(qp) == IB_QPT_RC)) { dst_hold(dst); sk_dst_set(qp->sk->sk, dst); } } return dst; } static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { struct udphdr *udph; struct rxe_dev *rxe; struct net_device *ndev = skb->dev; struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); /* takes a reference on rxe->ib_dev * drop when skb is freed */ rxe = rxe_get_dev_from_net(ndev); if (!rxe && is_vlan_dev(ndev)) rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); if (!rxe) goto drop; if (skb_linearize(skb)) { ib_device_put(&rxe->ib_dev); goto drop; } udph = udp_hdr(skb); pkt->rxe = rxe; pkt->port_num = 1; pkt->hdr = (u8 *)(udph + 1); pkt->mask = RXE_GRH_MASK; pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph); /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; drop: kfree_skb(skb); return 0; } static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, bool ipv6) { int err; struct socket *sock; struct udp_port_cfg udp_cfg = { }; struct udp_tunnel_sock_cfg tnl_cfg = { }; if (ipv6) { udp_cfg.family = AF_INET6; udp_cfg.ipv6_v6only = 1; } else { udp_cfg.family = AF_INET; } udp_cfg.local_udp_port = port; /* Create UDP socket */ err = udp_sock_create(net, &udp_cfg, &sock); if (err < 0) return ERR_PTR(err); tnl_cfg.encap_type = 1; tnl_cfg.encap_rcv = rxe_udp_encap_recv; /* Setup UDP tunnel */ setup_udp_tunnel_sock(net, sock, &tnl_cfg); return sock; } static void rxe_release_udp_tunnel(struct socket *sk) { if (sk) udp_tunnel_sock_release(sk); } static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port, __be16 dst_port) { struct udphdr *udph; __skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); udph = udp_hdr(skb); udph->dest = dst_port; udph->source = src_port; udph->len = htons(skb->len); udph->check = 0; } static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb, __be32 saddr, __be32 daddr, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { struct iphdr *iph; skb_scrub_packet(skb, xnet); skb_clear_hash(skb); skb_dst_set(skb, dst_clone(dst)); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); iph = ip_hdr(skb); iph->version = IPVERSION; iph->ihl = sizeof(struct iphdr) >> 2; iph->tot_len = htons(skb->len); iph->frag_off = df; iph->protocol = proto; iph->tos = tos; iph->daddr = daddr; iph->saddr = saddr; iph->ttl = ttl; __ip_select_ident(dev_net(dst->dev), iph, skb_shinfo(skb)->gso_segs ?: 1); } static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, struct in6_addr *saddr, struct in6_addr *daddr, __u8 proto, __u8 prio, __u8 ttl) { struct ipv6hdr *ip6h; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); skb_dst_set(skb, dst_clone(dst)); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, htonl(0)); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = proto; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; bool xnet = false; __be16 df = htons(IP_DF); struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); dst_release(dst); return 0; } static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; dst = rxe_find_route(skb->dev, qp, av); if (!dst) { rxe_dbg_qp(qp, "Host not reachable\n"); return -EHOSTUNREACH; } prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit); dst_release(dst); return 0; } int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err = 0; if (skb->protocol == htons(ETH_P_IP)) err = prepare4(av, pkt, skb); else if (skb->protocol == htons(ETH_P_IPV6)) err = prepare6(av, pkt, skb); if (ether_addr_equal(skb->dev->dev_addr, av->dmac)) pkt->mask |= RXE_LOOPBACK_MASK; return err; } static void rxe_skb_tx_dtor(struct sk_buff *skb) { struct net_device *ndev = skb->dev; struct rxe_dev *rxe; unsigned int qp_index; struct rxe_qp *qp; int skb_out; rxe = rxe_get_dev_from_net(ndev); if (!rxe && is_vlan_dev(ndev)) rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); if (WARN_ON(!rxe)) return; qp_index = (int)(uintptr_t)skb->sk->sk_user_data; if (!qp_index) return; qp = rxe_pool_get_index(&rxe->qp_pool, qp_index); if (!qp) goto put_dev; skb_out = atomic_dec_return(&qp->skb_out); if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW) rxe_sched_task(&qp->send_task); rxe_put(qp); put_dev: ib_device_put(&rxe->ib_dev); sock_put(skb->sk); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; struct sock *sk = pkt->qp->sk->sk; sock_hold(sk); skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); return err; } /* fix up a send packet to match the packets * received from UDP before looping them back */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { struct sock *sk = pkt->qp->sk->sk; memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); sock_hold(sk); skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; atomic_inc(&pkt->qp->skb_out); if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else skb_pull(skb, sizeof(struct ipv6hdr)); if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) { kfree_skb(skb); return -EIO; } /* remove udp header */ skb_pull(skb, sizeof(struct udphdr)); rxe_rcv(skb); return 0; } int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct sk_buff *skb) { int err; int is_request = pkt->mask & RXE_REQ_MASK; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); if ((is_request && (qp_state(qp) < IB_QPS_RTS)) || (!is_request && (qp_state(qp) < IB_QPS_RTR))) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n"); goto drop; } spin_unlock_irqrestore(&qp->state_lock, flags); rxe_icrc_generate(skb, pkt); if (pkt->mask & RXE_LOOPBACK_MASK) err = rxe_loopback(skb, pkt); else err = rxe_send(skb, pkt); if (err) { rxe_counter_inc(rxe, RXE_CNT_SEND_ERR); return err; } rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; drop: kfree_skb(skb); err = 0; done: return err; } struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { unsigned int hdr_len; struct sk_buff *skb = NULL; struct net_device *ndev; const struct ib_gid_attr *attr; const int port_num = 1; attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); if (IS_ERR(attr)) return NULL; if (av->network_type == RXE_NETWORK_TYPE_IPV4) hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct iphdr); else hdr_len = ETH_HLEN + sizeof(struct udphdr) + sizeof(struct ipv6hdr); rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(attr); if (IS_ERR(ndev)) { rcu_read_unlock(); goto out; } skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), GFP_ATOMIC); if (unlikely(!skb)) { rcu_read_unlock(); goto out; } skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev)); /* FIXME: hold reference to this netdev until life of this skb. */ skb->dev = ndev; rcu_read_unlock(); if (av->network_type == RXE_NETWORK_TYPE_IPV4) skb->protocol = htons(ETH_P_IP); else skb->protocol = htons(ETH_P_IPV6); pkt->rxe = rxe; pkt->port_num = port_num; pkt->hdr = skb_put(skb, paylen); pkt->mask |= RXE_GRH_MASK; out: rdma_put_gid_attr(attr); return skb; } /* * this is required by rxe_cfg to match rxe devices in * /sys/class/infiniband up with their underlying ethernet devices */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { struct net_device *ndev; char *ndev_name; ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); if (!ndev) return NULL; ndev_name = ndev->name; dev_put(ndev); return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) { int err; struct rxe_dev *rxe = NULL; rxe = ib_alloc_device(rxe_dev, ib_dev); if (!rxe) return -ENOMEM; ib_mark_name_assigned_by_user(&rxe->ib_dev); err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; } return 0; } static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) { struct ib_event ev; ev.device = &rxe->ib_dev; ev.element.port_num = 1; ev.event = event; ib_dispatch_event(&ev); } /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); } void rxe_set_port_state(struct rxe_dev *rxe) { struct net_device *ndev; ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); if (!ndev) return; if (ib_get_curr_port_state(ndev) == IB_PORT_ACTIVE) rxe_port_up(rxe); else rxe_port_down(rxe); dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, unsigned long event, void *arg) { struct net_device *ndev = netdev_notifier_info_to_dev(arg); struct rxe_dev *rxe = rxe_get_dev_from_net(ndev); if (!rxe) return NOTIFY_OK; switch (event) { case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; case NETDEV_DOWN: case NETDEV_CHANGE: if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN) rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: default: rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n", event, ndev->name); break; } ib_device_put(&rxe->ib_dev); return NOTIFY_OK; } static struct notifier_block rxe_net_notifier = { .notifier_call = rxe_notify, }; static int rxe_net_ipv4_init(void) { recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), false); if (IS_ERR(recv_sockets.sk4)) { recv_sockets.sk4 = NULL; pr_err("Failed to create IPv4 UDP tunnel\n"); return -1; } return 0; } static int rxe_net_ipv6_init(void) { #if IS_ENABLED(CONFIG_IPV6) recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, htons(ROCE_V2_UDP_DPORT), true); if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { recv_sockets.sk6 = NULL; pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); return 0; } if (IS_ERR(recv_sockets.sk6)) { recv_sockets.sk6 = NULL; pr_err("Failed to create IPv6 UDP tunnel\n"); return -1; } #endif return 0; } void rxe_net_exit(void) { rxe_release_udp_tunnel(recv_sockets.sk6); rxe_release_udp_tunnel(recv_sockets.sk4); unregister_netdevice_notifier(&rxe_net_notifier); } int rxe_net_init(void) { int err; recv_sockets.sk6 = NULL; err = rxe_net_ipv4_init(); if (err) return err; err = rxe_net_ipv6_init(); if (err) goto err_out; err = register_netdevice_notifier(&rxe_net_notifier); if (err) { pr_err("Failed to register netdev notifier\n"); goto err_out; } return 0; err_out: rxe_net_exit(); return err; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 /* SPDX-License-Identifier: GPL-2.0 */ /* * Mutexes: blocking mutual exclusion locks * * started by Ingo Molnar: * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * * This file contains the main data structure and API definitions. */ #ifndef __LINUX_MUTEX_H #define __LINUX_MUTEX_H #include <asm/current.h> #include <linux/list.h> #include <linux/spinlock_types.h> #include <linux/lockdep.h> #include <linux/atomic.h> #include <asm/processor.h> #include <linux/osq_lock.h> #include <linux/debug_locks.h> #include <linux/cleanup.h> #include <linux/mutex_types.h> struct device; #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ , .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ } #else # define __DEP_MAP_MUTEX_INITIALIZER(lockname) #endif #ifdef CONFIG_DEBUG_MUTEXES # define __DEBUG_MUTEX_INITIALIZER(lockname) \ , .magic = &lockname extern void mutex_destroy(struct mutex *lock); #else # define __DEBUG_MUTEX_INITIALIZER(lockname) static inline void mutex_destroy(struct mutex *lock) {} #endif /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized * * Initialize the mutex to unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ \ __mutex_init((mutex), #mutex, &__key); \ } while (0) /** * mutex_init_with_key - initialize a mutex with a given lockdep key * @mutex: the mutex to be initialized * @key: the lockdep key to be associated with the mutex * * Initialize the mutex to the unlocked state. * * It is not allowed to initialize an already locked mutex. */ #define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key)) #ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); /** * mutex_is_locked - is the mutex locked * @lock: the mutex to be queried * * Returns true if the mutex is locked, false if unlocked. */ extern bool mutex_is_locked(struct mutex *lock); #else /* !CONFIG_PREEMPT_RT */ /* * Preempt-RT variant based on rtmutexes. */ #define __MUTEX_INITIALIZER(mutexname) \ { \ .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex) \ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ } #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) extern void __mutex_rt_init(struct mutex *lock, const char *name, struct lock_class_key *key); #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) #define __mutex_init(mutex, name, key) \ do { \ rt_mutex_base_init(&(mutex)->rtmutex); \ __mutex_rt_init((mutex), name, key); \ } while (0) #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES int __devm_mutex_init(struct device *dev, struct mutex *lock); #else static inline int __devm_mutex_init(struct device *dev, struct mutex *lock) { /* * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so * no really need to register it in the devm subsystem. */ return 0; } #endif #define devm_mutex_init(dev, mutex) \ ({ \ typeof(mutex) mutex_ = (mutex); \ \ mutex_init(mutex_); \ __devm_mutex_init(dev, mutex_); \ }) /* * See kernel/locking/mutex.c for detailed documentation of these APIs. * Also see Documentation/locking/mutex-design.rst. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass); extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check _mutex_lock_killable(struct mutex *lock, unsigned int subclass, struct lockdep_map *nest_lock); extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) _mutex_lock_killable(lock, 0, NULL) #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ } while (0) #define mutex_lock_killable_nest_lock(lock, nest_lock) \ ( \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ _mutex_lock_killable(lock, 0, &(nest_lock)->dep_map) \ ) #define mutex_lock_killable_nested(lock, subclass) \ _mutex_lock_killable(lock, subclass, NULL) #else extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_killable_nest_lock(lock, nest_lock) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) # define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! * * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern int _mutex_trylock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); #define mutex_trylock_nest_lock(lock, nest_lock) \ ( \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map), \ _mutex_trylock_nest_lock(lock, &(nest_lock)->dep_map) \ ) #define mutex_trylock(lock) _mutex_trylock_nest_lock(lock, NULL) #else extern int mutex_trylock(struct mutex *lock); #define mutex_trylock_nest_lock(lock, nest_lock) mutex_trylock(lock) #endif extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T)) DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T)) DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0) extern unsigned long mutex_get_owner(struct mutex *lock); #endif /* __LINUX_MUTEX_H */
32 33 33 33 33 33 33 33 32 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 32 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 32 33 33 33 33 33 33 33 33 33 33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/signal.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/cache.h> #include <linux/compat.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/freezer.h> #include <linux/stddef.h> #include <linux/uaccess.h> #include <linux/sizes.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/rseq.h> #include <linux/syscalls.h> #include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/exception.h> #include <asm/cacheflush.h> #include <asm/gcs.h> #include <asm/ucontext.h> #include <asm/unistd.h> #include <asm/fpsimd.h> #include <asm/ptrace.h> #include <asm/syscall.h> #include <asm/signal32.h> #include <asm/traps.h> #include <asm/vdso.h> #define GCS_SIGNAL_CAP(addr) (((unsigned long)addr) & GCS_CAP_ADDR_MASK) /* * Do a signal return; undo the signal stack. These are aligned to 128-bit. */ struct rt_sigframe { struct siginfo info; struct ucontext uc; }; struct rt_sigframe_user_layout { struct rt_sigframe __user *sigframe; struct frame_record __user *next_frame; unsigned long size; /* size of allocated sigframe data */ unsigned long limit; /* largest allowed size */ unsigned long fpsimd_offset; unsigned long esr_offset; unsigned long gcs_offset; unsigned long sve_offset; unsigned long tpidr2_offset; unsigned long za_offset; unsigned long zt_offset; unsigned long fpmr_offset; unsigned long poe_offset; unsigned long extra_offset; unsigned long end_offset; }; /* * Holds any EL0-controlled state that influences unprivileged memory accesses. * This includes both accesses done in userspace and uaccess done in the kernel. * * This state needs to be carefully managed to ensure that it doesn't cause * uaccess to fail when setting up the signal frame, and the signal handler * itself also expects a well-defined state when entered. */ struct user_access_state { u64 por_el0; }; #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) /* * Save the user access state into ua_state and reset it to disable any * restrictions. */ static void save_reset_user_access_state(struct user_access_state *ua_state) { if (system_supports_poe()) { u64 por_enable_all = 0; for (int pkey = 0; pkey < arch_max_pkey(); pkey++) por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX); ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); write_sysreg_s(por_enable_all, SYS_POR_EL0); /* Ensure that any subsequent uaccess observes the updated value */ isb(); } } /* * Set the user access state for invoking the signal handler. * * No uaccess should be done after that function is called. */ static void set_handler_user_access_state(void) { if (system_supports_poe()) write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); } /* * Restore the user access state to the values saved in ua_state. * * No uaccess should be done after that function is called. */ static void restore_user_access_state(const struct user_access_state *ua_state) { if (system_supports_poe()) write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); } static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = sizeof(user->sigframe->uc.uc_mcontext.__reserved); memset(user, 0, sizeof(*user)); user->size = offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved); user->limit = user->size + reserved_size; user->limit -= TERMINATOR_SIZE; user->limit -= EXTRA_CONTEXT_SIZE; /* Reserve space for extension and terminator ^ */ } static size_t sigframe_size(struct rt_sigframe_user_layout const *user) { return round_up(max(user->size, sizeof(struct rt_sigframe)), 16); } /* * Sanity limit on the approximate maximum size of signal frame we'll * try to generate. Stack alignment padding and the frame record are * not taken into account. This limit is not a guarantee and is * NOT ABI. */ #define SIGFRAME_MAXSZ SZ_256K static int __sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size, bool extend) { size_t padded_size = round_up(size, 16); if (padded_size > user->limit - user->size && !user->extra_offset && extend) { int ret; user->limit += EXTRA_CONTEXT_SIZE; ret = __sigframe_alloc(user, &user->extra_offset, sizeof(struct extra_context), false); if (ret) { user->limit -= EXTRA_CONTEXT_SIZE; return ret; } /* Reserve space for the __reserved[] terminator */ user->size += TERMINATOR_SIZE; /* * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for * the terminator: */ user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE; } /* Still not enough space? Bad luck! */ if (padded_size > user->limit - user->size) return -ENOMEM; *offset = user->size; user->size += padded_size; return 0; } /* * Allocate space for an optional record of <size> bytes in the user * signal frame. The offset from the signal frame base address to the * allocated block is assigned to *offset. */ static int sigframe_alloc(struct rt_sigframe_user_layout *user, unsigned long *offset, size_t size) { return __sigframe_alloc(user, offset, size, true); } /* Allocate the null terminator record and prevent further allocations */ static int sigframe_alloc_end(struct rt_sigframe_user_layout *user) { int ret; /* Un-reserve the space reserved for the terminator: */ user->limit += TERMINATOR_SIZE; ret = sigframe_alloc(user, &user->end_offset, sizeof(struct _aarch64_ctx)); if (ret) return ret; /* Prevent further allocation: */ user->limit = user->size; return 0; } static void __user *apply_user_offset( struct rt_sigframe_user_layout const *user, unsigned long offset) { char __user *base = (char __user *)user->sigframe; return base + offset; } struct user_ctxs { struct fpsimd_context __user *fpsimd; u32 fpsimd_size; struct sve_context __user *sve; u32 sve_size; struct tpidr2_context __user *tpidr2; u32 tpidr2_size; struct za_context __user *za; u32 za_size; struct zt_context __user *zt; u32 zt_size; struct fpmr_context __user *fpmr; u32 fpmr_size; struct poe_context __user *poe; u32 poe_size; struct gcs_context __user *gcs; u32 gcs_size; }; static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { struct user_fpsimd_state const *fpsimd = &current->thread.uw.fpsimd_state; int err; fpsimd_sync_from_effective_state(current); /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); __put_user_error(fpsimd->fpcr, &ctx->fpcr, err); /* copy the magic/size information */ __put_user_error(FPSIMD_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(struct fpsimd_context), &ctx->head.size, err); return err ? -EFAULT : 0; } static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, struct user_ctxs *user) { int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), sizeof(fpsimd->vregs)); __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); return err ? -EFAULT : 0; } static int restore_fpsimd_context(struct user_ctxs *user) { struct user_fpsimd_state fpsimd; int err; err = read_fpsimd_context(&fpsimd, user); if (err) return err; clear_thread_flag(TIF_SVE); current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ fpsimd_update_current_state(&fpsimd); return 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); return err; } static int restore_fpmr_context(struct user_ctxs *user) { u64 fpmr; int err = 0; if (user->fpmr_size != sizeof(*user->fpmr)) return -EINVAL; __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) current->thread.uw.fpmr = fpmr; return err; } static int preserve_poe_context(struct poe_context __user *ctx, const struct user_access_state *ua_state) { int err = 0; __put_user_error(POE_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(ua_state->por_el0, &ctx->por_el0, err); return err; } static int restore_poe_context(struct user_ctxs *user, struct user_access_state *ua_state) { u64 por_el0; int err = 0; if (user->poe_size != sizeof(*user->poe)) return -EINVAL; __get_user_error(por_el0, &(user->poe->por_el0), err); if (!err) ua_state->por_el0 = por_el0; return err; } #ifdef CONFIG_ARM64_SVE static int preserve_sve_context(struct sve_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; u16 flags = 0; unsigned int vl = task_get_sve_vl(current); unsigned int vq = 0; if (thread_sm_enabled(&current->thread)) { vl = task_get_sme_vl(current); vq = sve_vq_from_vl(vl); flags |= SVE_SIG_FLAG_SM; } else if (current->thread.fp_type == FP_STATE_SVE) { vq = sve_vq_from_vl(vl); } memset(reserved, 0, sizeof(reserved)); __put_user_error(SVE_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); __put_user_error(flags, &ctx->flags, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err = 0; unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; __get_user_error(user_vl, &(user->sve->vl), err); __get_user_error(flags, &(user->sve->flags), err); if (err) return err; sm = flags & SVE_SIG_FLAG_SM; if (sm) { if (!system_supports_sme()) return -EINVAL; vl = task_get_sme_vl(current); } else { /* * A SME only system use SVE for streaming mode so can * have a SVE formatted context with a zero VL and no * payload data. */ if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; vl = task_get_sve_vl(current); } if (user_vl != vl) return -EINVAL; /* * Non-streaming SVE state may be preserved without an SVE payload, in * which case the SVE context only has a header with VL==0, and all * state can be restored from the FPSIMD context. * * Streaming SVE state is always preserved with an SVE payload. For * consistency and robustness, reject restoring streaming SVE state * without an SVE payload. */ if (!sm && user->sve_size == sizeof(*user->sve)) return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); return -ENOMEM; } err = __copy_from_user(current->thread.sve_state, (char __user const *)user->sve + SVE_SIG_REGS_OFFSET, SVE_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; if (flags & SVE_SIG_FLAG_SM) current->thread.svcr |= SVCR_SM_MASK; else set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; err = read_fpsimd_context(&fpsimd, user); if (err) return err; /* Merge the FPSIMD registers into the SVE state */ fpsimd_update_current_state(&fpsimd); return 0; } #else /* ! CONFIG_ARM64_SVE */ static int restore_sve_fpsimd_context(struct user_ctxs *user) { WARN_ON_ONCE(1); return -EINVAL; } /* Turn any non-optimised out attempts to use this into a link error: */ extern int preserve_sve_context(void __user *ctx); #endif /* ! CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } static int restore_tpidr2_context(struct user_ctxs *user) { u64 tpidr2_el0; int err = 0; if (user->tpidr2_size != sizeof(*user->tpidr2)) return -EINVAL; __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err); if (!err) write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0); return err; } static int preserve_za_context(struct za_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; unsigned int vl = task_get_sme_vl(current); unsigned int vq; if (thread_za_enabled(&current->thread)) vq = sve_vq_from_vl(vl); else vq = 0; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZA_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16), &ctx->head.size, err); __put_user_error(vl, &ctx->vl, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); } return err ? -EFAULT : 0; } static int restore_za_context(struct user_ctxs *user) { int err = 0; unsigned int vq; u16 user_vl; if (user->za_size < sizeof(*user->za)) return -EINVAL; __get_user_error(user_vl, &(user->za->vl), err); if (err) return err; if (user_vl != task_get_sme_vl(current)) return -EINVAL; if (user->za_size == sizeof(*user->za)) { current->thread.svcr &= ~SVCR_ZA_MASK; return 0; } vq = sve_vq_from_vl(user_vl); if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; clear_thread_flag(TIF_SME); return -ENOMEM; } err = __copy_from_user(current->thread.sme_state, (char __user const *)user->za + ZA_SIG_REGS_OFFSET, ZA_SIG_REGS_SIZE(vq)); if (err) return -EFAULT; set_thread_flag(TIF_SME); current->thread.svcr |= SVCR_ZA_MASK; return 0; } static int preserve_zt_context(struct zt_context __user *ctx) { int err = 0; u16 reserved[ARRAY_SIZE(ctx->__reserved)]; if (WARN_ON(!thread_za_enabled(&current->thread))) return -EINVAL; memset(reserved, 0, sizeof(reserved)); __put_user_error(ZT_MAGIC, &ctx->head.magic, err); __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16), &ctx->head.size, err); __put_user_error(1, &ctx->nregs, err); BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(&current->thread), ZT_SIG_REGS_SIZE(1)); return err ? -EFAULT : 0; } static int restore_zt_context(struct user_ctxs *user) { int err; u16 nregs; /* ZA must be restored first for this check to be valid */ if (!thread_za_enabled(&current->thread)) return -EINVAL; if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1)) return -EINVAL; if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs))) return -EFAULT; if (nregs != 1) return -EINVAL; err = __copy_from_user(thread_zt_state(&current->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, ZT_SIG_REGS_SIZE(1)); if (err) return -EFAULT; return 0; } #else /* ! CONFIG_ARM64_SME */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_tpidr2_context(void __user *ctx); extern int restore_tpidr2_context(struct user_ctxs *user); extern int preserve_za_context(void __user *ctx); extern int restore_za_context(struct user_ctxs *user); extern int preserve_zt_context(void __user *ctx); extern int restore_zt_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_GCS static int preserve_gcs_context(struct gcs_context __user *ctx) { int err = 0; u64 gcspr = read_sysreg_s(SYS_GCSPR_EL0); /* * If GCS is enabled we will add a cap token to the frame, * include it in the GCSPR_EL0 we report to support stack * switching via sigreturn if GCS is enabled. We do not allow * enabling via sigreturn so the token is only relevant for * threads with GCS enabled. */ if (task_gcs_el0_enabled(current)) gcspr -= 8; __put_user_error(GCS_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(gcspr, &ctx->gcspr, err); __put_user_error(0, &ctx->reserved, err); __put_user_error(current->thread.gcs_el0_mode, &ctx->features_enabled, err); return err; } static int restore_gcs_context(struct user_ctxs *user) { u64 gcspr, enabled; int err = 0; if (user->gcs_size != sizeof(*user->gcs)) return -EINVAL; __get_user_error(gcspr, &user->gcs->gcspr, err); __get_user_error(enabled, &user->gcs->features_enabled, err); if (err) return err; /* Don't allow unknown modes */ if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) return -EINVAL; err = gcs_check_locked(current, enabled); if (err != 0) return err; /* Don't allow enabling */ if (!task_gcs_el0_enabled(current) && (enabled & PR_SHADOW_STACK_ENABLE)) return -EINVAL; /* If we are disabling disable everything */ if (!(enabled & PR_SHADOW_STACK_ENABLE)) enabled = 0; current->thread.gcs_el0_mode = enabled; /* * We let userspace set GCSPR_EL0 to anything here, we will * validate later in gcs_restore_signal(). */ write_sysreg_s(gcspr, SYS_GCSPR_EL0); return 0; } #else /* ! CONFIG_ARM64_GCS */ /* Turn any non-optimised out attempts to use these into a link error: */ extern int preserve_gcs_context(void __user *ctx); extern int restore_gcs_context(struct user_ctxs *user); #endif /* ! CONFIG_ARM64_GCS */ static int parse_user_sigframe(struct user_ctxs *user, struct rt_sigframe __user *sf) { struct sigcontext __user *const sc = &sf->uc.uc_mcontext; struct _aarch64_ctx __user *head; char __user *base = (char __user *)&sc->__reserved; size_t offset = 0; size_t limit = sizeof(sc->__reserved); bool have_extra_context = false; char const __user *const sfp = (char const __user *)sf; user->fpsimd = NULL; user->sve = NULL; user->tpidr2 = NULL; user->za = NULL; user->zt = NULL; user->fpmr = NULL; user->poe = NULL; user->gcs = NULL; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; while (1) { int err = 0; u32 magic, size; char const __user *userp; struct extra_context const __user *extra; u64 extra_datap; u32 extra_size; struct _aarch64_ctx const __user *end; u32 end_magic, end_size; if (limit - offset < sizeof(*head)) goto invalid; if (!IS_ALIGNED(offset, 16)) goto invalid; head = (struct _aarch64_ctx __user *)(base + offset); __get_user_error(magic, &head->magic, err); __get_user_error(size, &head->size, err); if (err) return err; if (limit - offset < size) goto invalid; switch (magic) { case 0: if (size) goto invalid; goto done; case FPSIMD_MAGIC: if (!system_supports_fpsimd()) goto invalid; if (user->fpsimd) goto invalid; user->fpsimd = (struct fpsimd_context __user *)head; user->fpsimd_size = size; break; case ESR_MAGIC: /* ignore */ break; case POE_MAGIC: if (!system_supports_poe()) goto invalid; if (user->poe) goto invalid; user->poe = (struct poe_context __user *)head; user->poe_size = size; break; case SVE_MAGIC: if (!system_supports_sve() && !system_supports_sme()) goto invalid; if (user->sve) goto invalid; user->sve = (struct sve_context __user *)head; user->sve_size = size; break; case TPIDR2_MAGIC: if (!system_supports_tpidr2()) goto invalid; if (user->tpidr2) goto invalid; user->tpidr2 = (struct tpidr2_context __user *)head; user->tpidr2_size = size; break; case ZA_MAGIC: if (!system_supports_sme()) goto invalid; if (user->za) goto invalid; user->za = (struct za_context __user *)head; user->za_size = size; break; case ZT_MAGIC: if (!system_supports_sme2()) goto invalid; if (user->zt) goto invalid; user->zt = (struct zt_context __user *)head; user->zt_size = size; break; case FPMR_MAGIC: if (!system_supports_fpmr()) goto invalid; if (user->fpmr) goto invalid; user->fpmr = (struct fpmr_context __user *)head; user->fpmr_size = size; break; case GCS_MAGIC: if (!system_supports_gcs()) goto invalid; if (user->gcs) goto invalid; user->gcs = (struct gcs_context __user *)head; user->gcs_size = size; break; case EXTRA_MAGIC: if (have_extra_context) goto invalid; if (size < sizeof(*extra)) goto invalid; userp = (char const __user *)head; extra = (struct extra_context const __user *)userp; userp += size; __get_user_error(extra_datap, &extra->datap, err); __get_user_error(extra_size, &extra->size, err); if (err) return err; /* Check for the dummy terminator in __reserved[]: */ if (limit - offset - size < TERMINATOR_SIZE) goto invalid; end = (struct _aarch64_ctx const __user *)userp; userp += TERMINATOR_SIZE; __get_user_error(end_magic, &end->magic, err); __get_user_error(end_size, &end->size, err); if (err) return err; if (end_magic || end_size) goto invalid; /* Prevent looping/repeated parsing of extra_context */ have_extra_context = true; base = (__force void __user *)extra_datap; if (!IS_ALIGNED((unsigned long)base, 16)) goto invalid; if (!IS_ALIGNED(extra_size, 16)) goto invalid; if (base != userp) goto invalid; /* Reject "unreasonably large" frames: */ if (extra_size > sfp + SIGFRAME_MAXSZ - userp) goto invalid; /* * Ignore trailing terminator in __reserved[] * and start parsing extra data: */ offset = 0; limit = extra_size; if (!access_ok(base, limit)) goto invalid; continue; default: goto invalid; } if (size < sizeof(*head)) goto invalid; if (limit - offset < size) goto invalid; offset += size; } done: return 0; invalid: return -EINVAL; } static int restore_sigframe(struct pt_regs *regs, struct rt_sigframe __user *sf, struct user_access_state *ua_state) { sigset_t set; int i, err; struct user_ctxs user; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); if (err == 0) set_current_blocked(&set); for (i = 0; i < 31; i++) __get_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __get_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __get_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __get_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); /* * Avoid sys_rt_sigreturn() restarting. */ forget_syscall(regs); fpsimd_save_and_flush_current_state(); err |= !valid_user_regs(&regs->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); if (err == 0 && system_supports_fpsimd()) { if (!user.fpsimd) return -EINVAL; if (user.sve) err = restore_sve_fpsimd_context(&user); else err = restore_fpsimd_context(&user); } if (err == 0 && system_supports_gcs() && user.gcs) err = restore_gcs_context(&user); if (err == 0 && system_supports_tpidr2() && user.tpidr2) err = restore_tpidr2_context(&user); if (err == 0 && system_supports_fpmr() && user.fpmr) err = restore_fpmr_context(&user); if (err == 0 && system_supports_sme() && user.za) err = restore_za_context(&user); if (err == 0 && system_supports_sme2() && user.zt) err = restore_zt_context(&user); if (err == 0 && system_supports_poe() && user.poe) err = restore_poe_context(&user, ua_state); return err; } #ifdef CONFIG_ARM64_GCS static int gcs_restore_signal(void) { u64 gcspr_el0, cap; int ret; if (!system_supports_gcs()) return 0; if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE)) return 0; gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Ensure that any changes to the GCS done via GCS operations * are visible to the normal reads we do to validate the * token. */ gcsb_dsync(); /* * GCSPR_EL0 should be pointing at a capped GCS, read the cap. * We don't enforce that this is in a GCS page, if it is not * then faults will be generated on GCS operations - the main * concern is to protect GCS pages. */ ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0, sizeof(cap)); if (ret) return -EFAULT; /* * Check that the cap is the actual GCS before replacing it. */ if (cap != GCS_SIGNAL_CAP(gcspr_el0)) return -EINVAL; /* Invalidate the token to prevent reuse */ put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret); if (ret != 0) return -EFAULT; write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0); return 0; } #else static int gcs_restore_signal(void) { return 0; } #endif SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should * be word aligned here. */ if (regs->sp & 15) goto badframe; frame = (struct rt_sigframe __user *)regs->sp; if (!access_ok(frame, sizeof (*frame))) goto badframe; if (restore_sigframe(regs, frame, &ua_state)) goto badframe; if (gcs_restore_signal()) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; restore_user_access_state(&ua_state); return regs->regs[0]; badframe: arm64_notify_segfault(regs->sp); return 0; } /* * Determine the layout of optional records in the signal frame * * add_all: if true, lays out the biggest possible signal frame for * this task; otherwise, generates a layout for the current state * of the task. */ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, bool add_all) { int err; if (system_supports_fpsimd()) { err = sigframe_alloc(user, &user->fpsimd_offset, sizeof(struct fpsimd_context)); if (err) return err; } /* fault information, if valid */ if (add_all || current->thread.fault_code) { err = sigframe_alloc(user, &user->esr_offset, sizeof(struct esr_context)); if (err) return err; } #ifdef CONFIG_ARM64_GCS if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) { err = sigframe_alloc(user, &user->gcs_offset, sizeof(struct gcs_context)); if (err) return err; } #endif if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; if (add_all || current->thread.fp_type == FP_STATE_SVE || thread_sm_enabled(&current->thread)) { int vl = max(sve_max_vl(), sme_max_vl()); if (!add_all) vl = thread_get_cur_vl(&current->thread); vq = sve_vq_from_vl(vl); } err = sigframe_alloc(user, &user->sve_offset, SVE_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_tpidr2()) { err = sigframe_alloc(user, &user->tpidr2_offset, sizeof(struct tpidr2_context)); if (err) return err; } if (system_supports_sme()) { unsigned int vl; unsigned int vq = 0; if (add_all) vl = sme_max_vl(); else vl = task_get_sme_vl(current); if (thread_za_enabled(&current->thread)) vq = sve_vq_from_vl(vl); err = sigframe_alloc(user, &user->za_offset, ZA_SIG_CONTEXT_SIZE(vq)); if (err) return err; } if (system_supports_sme2()) { if (add_all || thread_za_enabled(&current->thread)) { err = sigframe_alloc(user, &user->zt_offset, ZT_SIG_CONTEXT_SIZE(1)); if (err) return err; } } if (system_supports_fpmr()) { err = sigframe_alloc(user, &user->fpmr_offset, sizeof(struct fpmr_context)); if (err) return err; } if (system_supports_poe()) { err = sigframe_alloc(user, &user->poe_offset, sizeof(struct poe_context)); if (err) return err; } return sigframe_alloc_end(user); } static int setup_sigframe(struct rt_sigframe_user_layout *user, struct pt_regs *regs, sigset_t *set, const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; /* set up the stack frame for unwinding */ __put_user_error(regs->regs[29], &user->next_frame->fp, err); __put_user_error(regs->regs[30], &user->next_frame->lr, err); for (i = 0; i < 31; i++) __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i], err); __put_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err); __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err); __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err); __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); if (err == 0 && system_supports_fpsimd()) { struct fpsimd_context __user *fpsimd_ctx = apply_user_offset(user, user->fpsimd_offset); err |= preserve_fpsimd_context(fpsimd_ctx); } /* fault information, if valid */ if (err == 0 && user->esr_offset) { struct esr_context __user *esr_ctx = apply_user_offset(user, user->esr_offset); __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); } if (system_supports_gcs() && err == 0 && user->gcs_offset) { struct gcs_context __user *gcs_ctx = apply_user_offset(user, user->gcs_offset); err |= preserve_gcs_context(gcs_ctx); } /* Scalable Vector Extension state (including streaming), if present */ if ((system_supports_sve() || system_supports_sme()) && err == 0 && user->sve_offset) { struct sve_context __user *sve_ctx = apply_user_offset(user, user->sve_offset); err |= preserve_sve_context(sve_ctx); } /* TPIDR2 if supported */ if (system_supports_tpidr2() && err == 0) { struct tpidr2_context __user *tpidr2_ctx = apply_user_offset(user, user->tpidr2_offset); err |= preserve_tpidr2_context(tpidr2_ctx); } /* FPMR if supported */ if (system_supports_fpmr() && err == 0) { struct fpmr_context __user *fpmr_ctx = apply_user_offset(user, user->fpmr_offset); err |= preserve_fpmr_context(fpmr_ctx); } if (system_supports_poe() && err == 0) { struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); err |= preserve_poe_context(poe_ctx, ua_state); } /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = apply_user_offset(user, user->za_offset); err |= preserve_za_context(za_ctx); } /* ZT state if present */ if (system_supports_sme2() && err == 0 && user->zt_offset) { struct zt_context __user *zt_ctx = apply_user_offset(user, user->zt_offset); err |= preserve_zt_context(zt_ctx); } if (err == 0 && user->extra_offset) { char __user *sfp = (char __user *)user->sigframe; char __user *userp = apply_user_offset(user, user->extra_offset); struct extra_context __user *extra; struct _aarch64_ctx __user *end; u64 extra_datap; u32 extra_size; extra = (struct extra_context __user *)userp; userp += EXTRA_CONTEXT_SIZE; end = (struct _aarch64_ctx __user *)userp; userp += TERMINATOR_SIZE; /* * extra_datap is just written to the signal frame. * The value gets cast back to a void __user * * during sigreturn. */ extra_datap = (__force u64)userp; extra_size = sfp + round_up(user->size, 16) - userp; __put_user_error(EXTRA_MAGIC, &extra->head.magic, err); __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err); __put_user_error(extra_datap, &extra->datap, err); __put_user_error(extra_size, &extra->size, err); /* Add the terminator */ __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } /* set the "end" magic */ if (err == 0) { struct _aarch64_ctx __user *end = apply_user_offset(user, user->end_offset); __put_user_error(0, &end->magic, err); __put_user_error(0, &end->size, err); } return err; } static int get_sigframe(struct rt_sigframe_user_layout *user, struct ksignal *ksig, struct pt_regs *regs) { unsigned long sp, sp_top; int err; init_user_layout(user); err = setup_sigframe_layout(user, false); if (err) return err; sp = sp_top = sigsp(regs->sp, ksig); sp = round_down(sp - sizeof(struct frame_record), 16); user->next_frame = (struct frame_record __user *)sp; sp = round_down(sp, 16) - sigframe_size(user); user->sigframe = (struct rt_sigframe __user *)sp; /* * Check that we can actually write to the signal frame. */ if (!access_ok(user->sigframe, sp_top - sp)) return -EFAULT; return 0; } #ifdef CONFIG_ARM64_GCS static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { u64 gcspr_el0; int ret = 0; if (!system_supports_gcs()) return 0; if (!task_gcs_el0_enabled(current)) return 0; /* * We are entering a signal handler, current register state is * active. */ gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0); /* * Push a cap and the GCS entry for the trampoline onto the GCS. */ put_user_gcs((unsigned long)sigtramp, (unsigned long __user *)(gcspr_el0 - 16), &ret); put_user_gcs(GCS_SIGNAL_CAP(gcspr_el0 - 8), (unsigned long __user *)(gcspr_el0 - 8), &ret); if (ret != 0) return ret; gcspr_el0 -= 16; write_sysreg_s(gcspr_el0, SYS_GCSPR_EL0); return 0; } #else static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig) { return 0; } #endif static int setup_return(struct pt_regs *regs, struct ksignal *ksig, struct rt_sigframe_user_layout *user, int usig) { __sigrestore_t sigtramp; int err; if (ksig->ka.sa.sa_flags & SA_RESTORER) sigtramp = ksig->ka.sa.sa_restorer; else sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); err = gcs_signal_entry(sigtramp, ksig); if (err) return err; /* * We must not fail from this point onwards. We are going to update * registers, including SP, in order to invoke the signal handler. If * we failed and attempted to deliver a nested SIGSEGV to a handler * after that point, the subsequent sigreturn would end up restoring * the (partial) state for the original signal handler. */ regs->regs[0] = usig; if (ksig->ka.sa.sa_flags & SA_SIGINFO) { regs->regs[1] = (unsigned long)&user->sigframe->info; regs->regs[2] = (unsigned long)&user->sigframe->uc; } regs->sp = (unsigned long)user->sigframe; regs->regs[29] = (unsigned long)&user->next_frame->fp; regs->regs[30] = (unsigned long)sigtramp; regs->pc = (unsigned long)ksig->ka.sa.sa_handler; /* * Signal delivery is a (wacky) indirect function call in * userspace, so simulate the same setting of BTYPE as a BLR * <register containing the signal handler entry point>. * Signal delivery to a location in a PROT_BTI guarded page * that is not a function entry point will now trigger a * SIGILL in userspace. * * If the signal handler entry point is not in a PROT_BTI * guarded page, this is harmless. */ if (system_supports_bti()) { regs->pstate &= ~PSR_BTYPE_MASK; regs->pstate |= PSR_BTYPE_C; } /* TCO (Tag Check Override) always cleared for signal handlers */ regs->pstate &= ~PSR_TCO_BIT; /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { task_smstop_sm(current); current->thread.svcr &= ~SVCR_ZA_MASK; write_sysreg_s(0, SYS_TPIDR2_EL0); } return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs) { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; struct user_access_state ua_state; int err = 0; fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); err |= setup_sigframe(&user, regs, set, &ua_state); if (ksig->ka.sa.sa_flags & SA_SIGINFO) err |= copy_siginfo_to_user(&frame->info, &ksig->info); if (err == 0) err = setup_return(regs, ksig, &user, usig); /* * We must not fail if setup_return() succeeded - see comment at the * beginning of setup_return(). */ if (err == 0) set_handler_user_access_state(); else restore_user_access_state(&ua_state); return err; } static void setup_restart_syscall(struct pt_regs *regs) { if (is_compat_task()) compat_setup_restart_syscall(regs); else regs->regs[8] = __NR_restart_syscall; } /* * OK, we're invoking a handler */ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { sigset_t *oldset = sigmask_to_save(); int usig = ksig->sig; int ret; rseq_signal_deliver(ksig, regs); /* * Set up the stack frame */ if (is_compat_task()) { if (ksig->ka.sa.sa_flags & SA_SIGINFO) ret = compat_setup_rt_frame(usig, ksig, oldset, regs); else ret = compat_setup_frame(usig, ksig, oldset, regs); } else { ret = setup_rt_frame(usig, ksig, oldset, regs); } /* * Check that the resulting registers are actually sane. */ ret |= !valid_user_regs(&regs->user_regs, current); /* Step into the signal handler if we are stepping */ signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP)); } /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. * * Note that we go through the signals twice: once to check the signals that * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ void do_signal(struct pt_regs *regs) { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; struct ksignal ksig; bool syscall = in_syscall(regs); /* * If we were from a system call, check for system call restarting... */ if (syscall) { continue_addr = regs->pc; restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4); retval = regs->regs[0]; /* * Avoid additional syscall restarting via ret_to_user. */ forget_syscall(regs); /* * Prepare for system call restart. We do this here so that a * debugger will see the already changed PC. */ switch (retval) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: case -ERESTART_RESTARTBLOCK: regs->regs[0] = regs->orig_x0; regs->pc = restart_addr; break; } } /* * Get the signal to deliver. When running under ptrace, at this point * the debugger may change all of our registers. */ if (get_signal(&ksig)) { /* * Depending on the signal settings, we may need to revert the * decision to restart the system call, but skip this if a * debugger has chosen to restart at a different PC. */ if (regs->pc == restart_addr && (retval == -ERESTARTNOHAND || retval == -ERESTART_RESTARTBLOCK || (retval == -ERESTARTSYS && !(ksig.ka.sa.sa_flags & SA_RESTART)))) { syscall_set_return_value(current, regs, -EINTR, 0); regs->pc = continue_addr; } handle_signal(&ksig, regs); return; } /* * Handle restarting a different system call. As above, if a debugger * has chosen to restart at a different PC, ignore the restart. */ if (syscall && regs->pc == restart_addr) { if (retval == -ERESTART_RESTARTBLOCK) setup_restart_syscall(regs); user_rewind_single_step(current); } restore_saved_sigmask(); } unsigned long __ro_after_init signal_minsigstksz; /* * Determine the stack space required for guaranteed signal devliery. * This function is used to populate AT_MINSIGSTKSZ at process startup. * cpufeatures setup is assumed to be complete. */ void __init minsigstksz_setup(void) { struct rt_sigframe_user_layout user; init_user_layout(&user); /* * If this fails, SIGFRAME_MAXSZ needs to be enlarged. It won't * be big enough, but it's our best guess: */ if (WARN_ON(setup_sigframe_layout(&user, true))) return; signal_minsigstksz = sigframe_size(&user) + round_up(sizeof(struct frame_record), 16) + 16; /* max alignment padding */ } /* * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as * changes likely come with new fields that should be added below. */ static_assert(NSIGILL == 11); static_assert(NSIGFPE == 15); static_assert(NSIGSEGV == 10); static_assert(NSIGBUS == 5); static_assert(NSIGTRAP == 6); static_assert(NSIGCHLD == 6); static_assert(NSIGSYS == 2); static_assert(sizeof(siginfo_t) == 128); static_assert(__alignof__(siginfo_t) == 8); static_assert(offsetof(siginfo_t, si_signo) == 0x00); static_assert(offsetof(siginfo_t, si_errno) == 0x04); static_assert(offsetof(siginfo_t, si_code) == 0x08); static_assert(offsetof(siginfo_t, si_pid) == 0x10); static_assert(offsetof(siginfo_t, si_uid) == 0x14); static_assert(offsetof(siginfo_t, si_tid) == 0x10); static_assert(offsetof(siginfo_t, si_overrun) == 0x14); static_assert(offsetof(siginfo_t, si_status) == 0x18); static_assert(offsetof(siginfo_t, si_utime) == 0x20); static_assert(offsetof(siginfo_t, si_stime) == 0x28); static_assert(offsetof(siginfo_t, si_value) == 0x18); static_assert(offsetof(siginfo_t, si_int) == 0x18); static_assert(offsetof(siginfo_t, si_ptr) == 0x18); static_assert(offsetof(siginfo_t, si_addr) == 0x10); static_assert(offsetof(siginfo_t, si_addr_lsb) == 0x18); static_assert(offsetof(siginfo_t, si_lower) == 0x20); static_assert(offsetof(siginfo_t, si_upper) == 0x28); static_assert(offsetof(siginfo_t, si_pkey) == 0x20); static_assert(offsetof(siginfo_t, si_perf_data) == 0x18); static_assert(offsetof(siginfo_t, si_perf_type) == 0x20); static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24); static_assert(offsetof(siginfo_t, si_band) == 0x10); static_assert(offsetof(siginfo_t, si_fd) == 0x18); static_assert(offsetof(siginfo_t, si_call_addr) == 0x10); static_assert(offsetof(siginfo_t, si_syscall) == 0x18); static_assert(offsetof(siginfo_t, si_arch) == 0x1c);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PATH_H #define _LINUX_PATH_H struct dentry; struct vfsmount; struct path { struct vfsmount *mnt; struct dentry *dentry; } __randomize_layout; extern void path_get(const struct path *); extern void path_put(const struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty * path correctly just ensure @path is initialized: * * struct path path __free(path_put) = {}; */ #define __free_path_put path_put #endif /* _LINUX_PATH_H */
19 19 11 1 11 2 1 1 1 1 3 1 1 269 271 265 6 5 264 106 106 106 26 26 100 87 87 52 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 4 1 2 1 1 2 26 269 271 83 83 82 82 1 81 79 19 19 9 6 294 293 294 294 291 285 293 288 6 294 293 294 291 294 235 235 235 235 235 12 1 1 4 1 2 4 1 1 1 1 9 6 9 291 270 31 31 31 31 23 291 2 1 5 179 115 289 1 250 38 284 284 16 8 8 8 8 11 11 8 9 9 9 6 6 6 284 58 55 55 54 55 55 55 55 269 19 19 112 179 2 288 294 285 16 1 282 291 256 35 290 291 291 41 22 21 43 64 288 288 255 33 111 177 286 288 234 288 288 288 288 92 142 235 22 5 2 8 6 23 23 1 1 7 8 1 4 2 1 4 97 97 97 96 97 97 97 1 3 97 87 9 9 77 89 89 89 89 89 102 101 89 3 5 91 11 102 91 2 94 10 83 55 14 5 9 4 8 437 305 2 102 5 2 301 212 94 1 1 2 1 1 2 1 55 2 14 2 5 3 2 21 1 5 7 1 3 1 1 1 14 117 2 1 2 4 73 1 1 2 4 2 2 2 15 1 4 49 48 49 43 6 8 8 49 7 29 43 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall <c.dall@virtualopensystems.com> */ #include <linux/bug.h> #include <linux/cpu_pm.h> #include <linux/entry-kvm.h> #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> #include <linux/list.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/fs.h> #include <linux/mman.h> #include <linux/sched.h> #include <linux/kvm.h> #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> #include <linux/psci.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS #include "trace_arm.h" #include <linux/uaccess.h> #include <asm/ptrace.h> #include <asm/mman.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> #include <asm/virt.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> #include <asm/kvm_pkvm.h> #include <asm/kvm_ptrauth.h> #include <asm/sections.h> #include <kvm/arm_hypercalls.h> #include <kvm/arm_pmu.h> #include <kvm/arm_psci.h> #include "sys_regs.h" static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; enum kvm_wfx_trap_policy { KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */ KVM_WFX_NOTRAP, KVM_WFX_TRAP, }; static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK; DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); static bool vgic_present, kvm_arm_initialised; static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized); bool is_kvm_arm_initialised(void) { return kvm_arm_initialised; } int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r = -EINVAL; if (cap->flags) return -EINVAL; if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) return -EINVAL; switch (cap->cap) { case KVM_CAP_ARM_NISV_TO_USER: r = 0; set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER, &kvm->arch.flags); break; case KVM_CAP_ARM_MTE: mutex_lock(&kvm->lock); if (system_supports_mte() && !kvm->created_vcpus) { r = 0; set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); } mutex_unlock(&kvm->lock); break; case KVM_CAP_ARM_SYSTEM_SUSPEND: r = 0; set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: mutex_lock(&kvm->slots_lock); /* * To keep things simple, allow changing the chunk * size only when no memory slots have been created. */ if (kvm_are_all_memslots_empty(kvm)) { u64 new_cap = cap->args[0]; if (!new_cap || kvm_is_block_size_supported(new_cap)) { r = 0; kvm->arch.mmu.split_page_chunk_size = new_cap; } } mutex_unlock(&kvm->slots_lock); break; case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: mutex_lock(&kvm->lock); if (!kvm->created_vcpus) { r = 0; set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags); } mutex_unlock(&kvm->lock); break; default: break; } return r; } static int kvm_arm_default_max_vcpus(void) { return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct * @type: kvm device type */ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int ret; mutex_init(&kvm->arch.config_lock); #ifdef CONFIG_LOCKDEP /* Clue in lockdep that the config_lock must be taken inside kvm->lock */ mutex_lock(&kvm->lock); mutex_lock(&kvm->arch.config_lock); mutex_unlock(&kvm->arch.config_lock); mutex_unlock(&kvm->lock); #endif kvm_init_nested(kvm); ret = kvm_share_hyp(kvm, kvm + 1); if (ret) return ret; ret = pkvm_init_host_vm(kvm); if (ret) goto err_unshare_kvm; if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) { ret = -ENOMEM; goto err_unshare_kvm; } cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask); ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type); if (ret) goto err_free_cpumask; kvm_vgic_early_init(kvm); kvm_timer_init_vm(kvm); /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); kvm_arm_init_hypercalls(kvm); bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); return 0; err_free_cpumask: free_cpumask_var(kvm->arch.supported_cpus); err_unshare_kvm: kvm_unshare_hyp(kvm, kvm + 1); return ret; } vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } void kvm_arch_create_vm_debugfs(struct kvm *kvm) { kvm_sys_regs_create_debugfs(kvm); kvm_s2_ptdump_create_debugfs(kvm); } static void kvm_destroy_mpidr_data(struct kvm *kvm) { struct kvm_mpidr_data *data; mutex_lock(&kvm->arch.config_lock); data = rcu_dereference_protected(kvm->arch.mpidr_data, lockdep_is_held(&kvm->arch.config_lock)); if (data) { rcu_assign_pointer(kvm->arch.mpidr_data, NULL); synchronize_rcu(); kfree(data); } mutex_unlock(&kvm->arch.config_lock); } /** * kvm_arch_destroy_vm - destroy the VM data structure * @kvm: pointer to the KVM struct */ void kvm_arch_destroy_vm(struct kvm *kvm) { bitmap_free(kvm->arch.pmu_filter); free_cpumask_var(kvm->arch.supported_cpus); kvm_vgic_destroy(kvm); if (is_protected_kvm_enabled()) pkvm_destroy_hyp_vm(kvm); kvm_destroy_mpidr_data(kvm); kfree(kvm->arch.sysreg_masks); kvm_destroy_vcpus(kvm); kvm_unshare_hyp(kvm, kvm + 1); kvm_arm_teardown_hypercalls(kvm); } static bool kvm_has_full_ptr_auth(void) { bool apa, gpa, api, gpi, apa3, gpa3; u64 isar1, isar2, val; /* * Check that: * * - both Address and Generic auth are implemented for a given * algorithm (Q5, IMPDEF or Q3) * - only a single algorithm is implemented. */ if (!system_has_full_ptr_auth()) return false; isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1); val = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1); gpa = (val == ID_AA64ISAR1_EL1_GPA_IMP); api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1); val = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1); gpi = (val == ID_AA64ISAR1_EL1_GPI_IMP); apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2); val = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2); gpa3 = (val == ID_AA64ISAR2_EL1_GPA3_IMP); return (apa == gpa && api == gpi && apa3 == gpa3 && (apa + api + apa3) == 1); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) return 0; switch (ext) { case KVM_CAP_IRQCHIP: r = vgic_present; break; case KVM_CAP_IOEVENTFD: case KVM_CAP_USER_MEMORY: case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_ARM_PSCI: case KVM_CAP_ARM_PSCI_0_2: case KVM_CAP_READONLY_MEM: case KVM_CAP_MP_STATE: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2: case KVM_CAP_ARM_NISV_TO_USER: case KVM_CAP_ARM_INJECT_EXT_DABT: case KVM_CAP_SET_GUEST_DEBUG: case KVM_CAP_VCPU_ATTRIBUTES: case KVM_CAP_PTP_KVM: case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: return KVM_GUESTDBG_VALID_MASK; case KVM_CAP_ARM_SET_DEVICE_ADDR: r = 1; break; case KVM_CAP_NR_VCPUS: /* * ARM64 treats KVM_CAP_NR_CPUS differently from all other * architectures, as it does not always bound it to * KVM_CAP_MAX_VCPUS. It should not matter much because * this is just an advisory value. */ r = min_t(unsigned int, num_online_cpus(), kvm_arm_default_max_vcpus()); break; case KVM_CAP_MAX_VCPUS: case KVM_CAP_MAX_VCPU_ID: if (kvm) r = kvm->max_vcpus; else r = kvm_arm_default_max_vcpus(); break; case KVM_CAP_MSI_DEVID: if (!kvm) r = -EINVAL; else r = kvm->arch.vgic.msis_require_devid; break; case KVM_CAP_ARM_USER_IRQ: /* * 1: EL1_VTIMER, EL1_PTIMER, and PMU. * (bump this number if adding more devices) */ r = 1; break; case KVM_CAP_ARM_MTE: r = system_supports_mte(); break; case KVM_CAP_STEAL_TIME: r = kvm_arm_pvtime_supported(); break; case KVM_CAP_ARM_EL1_32BIT: r = cpus_have_final_cap(ARM64_HAS_32BIT_EL1); break; case KVM_CAP_ARM_EL2: r = cpus_have_final_cap(ARM64_HAS_NESTED_VIRT); break; case KVM_CAP_ARM_EL2_E2H0: r = cpus_have_final_cap(ARM64_HAS_HCR_NV1); break; case KVM_CAP_GUEST_DEBUG_HW_BPS: r = get_num_brps(); break; case KVM_CAP_GUEST_DEBUG_HW_WPS: r = get_num_wrps(); break; case KVM_CAP_ARM_PMU_V3: r = kvm_supports_guest_pmuv3(); break; case KVM_CAP_ARM_INJECT_SERROR_ESR: r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); break; case KVM_CAP_ARM_VM_IPA_SIZE: r = get_kvm_ipa_limit(); break; case KVM_CAP_ARM_SVE: r = system_supports_sve(); break; case KVM_CAP_ARM_PTRAUTH_ADDRESS: case KVM_CAP_ARM_PTRAUTH_GENERIC: r = kvm_has_full_ptr_auth(); break; case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: if (kvm) r = kvm->arch.mmu.split_page_chunk_size; else r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; break; case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: r = kvm_supported_block_sizes(); break; case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES: r = BIT(0); break; case KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED: if (!kvm) r = -EINVAL; else r = kvm_supports_cacheable_pfnmap(); break; default: r = 0; } return r; } long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { return -EINVAL; } struct kvm *kvm_arch_alloc_vm(void) { size_t sz = sizeof(struct kvm); if (!has_vhe()) return kzalloc(sz, GFP_KERNEL_ACCOUNT); return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO); } int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) return -EBUSY; if (id >= kvm->max_vcpus) return -EINVAL; return 0; } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { int err; spin_lock_init(&vcpu->arch.mp_state_lock); #ifdef CONFIG_LOCKDEP /* Inform lockdep that the config_lock is acquired after vcpu->mutex */ mutex_lock(&vcpu->mutex); mutex_lock(&vcpu->kvm->arch.config_lock); mutex_unlock(&vcpu->kvm->arch.config_lock); mutex_unlock(&vcpu->mutex); #endif /* Force users to call KVM_ARM_VCPU_INIT */ vcpu_clear_flag(vcpu, VCPU_INITIALIZED); vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; /* Set up the timer */ kvm_timer_vcpu_init(vcpu); kvm_pmu_vcpu_init(vcpu); kvm_arm_pvtime_vcpu_init(&vcpu->arch); vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; /* * This vCPU may have been created after mpidr_data was initialized. * Throw out the pre-computed mappings if that is the case which forces * KVM to fall back to iteratively searching the vCPUs. */ kvm_destroy_mpidr_data(vcpu->kvm); err = kvm_vgic_vcpu_init(vcpu); if (err) return err; err = kvm_share_hyp(vcpu, vcpu + 1); if (err) kvm_vgic_vcpu_destroy(vcpu); return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { if (!is_protected_kvm_enabled()) kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); else free_hyp_memcache(&vcpu->arch.pkvm_memcache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu); kvm_arm_vcpu_destroy(vcpu); } void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { } static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu) { if (vcpu_has_ptrauth(vcpu) && !is_protected_kvm_enabled()) { /* * Either we're running an L2 guest, and the API/APK bits come * from L1's HCR_EL2, or API/APK are both set. */ if (unlikely(is_nested_ctxt(vcpu))) { u64 val; val = __vcpu_sys_reg(vcpu, HCR_EL2); val &= (HCR_API | HCR_APK); vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK); vcpu->arch.hcr_el2 |= val; } else { vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); } /* * Save the host keys if there is any chance for the guest * to use pauth, as the entry code will reload the guest * keys in that case. */ if (vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)) { struct kvm_cpu_context *ctxt; ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt); ptrauth_save_keys(ctxt); } } } static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu) { if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) return kvm_wfi_trap_policy == KVM_WFX_NOTRAP; return single_task_running() && (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || vcpu->kvm->arch.vgic.nassgireq); } static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu) { if (unlikely(kvm_wfe_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK)) return kvm_wfe_trap_policy == KVM_WFX_NOTRAP; return single_task_running(); } void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvm_s2_mmu *mmu; int *last_ran; if (is_protected_kvm_enabled()) goto nommu; if (vcpu_has_nv(vcpu)) kvm_vcpu_load_hw_mmu(vcpu); mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); /* * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, * which happens eagerly in VHE. * * Also, the VMID allocator only preserves VMIDs that are active at the * time of rollover, so KVM might need to grab a new VMID for the MMU if * this is called from kvm_sched_in(). */ kvm_arm_vmid_update(&mmu->vmid); /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has * previously run on the same physical CPU, call into the * hypervisor code to nuke the relevant contexts. * * We might get preempted before the vCPU actually runs, but * over-invalidation doesn't affect correctness. */ if (*last_ran != vcpu->vcpu_idx) { kvm_call_hyp(__kvm_flush_cpu_context, mmu); *last_ran = vcpu->vcpu_idx; } nommu: vcpu->cpu = cpu; /* * The timer must be loaded before the vgic to correctly set up physical * interrupt deactivation in nested state (e.g. timer interrupt). */ kvm_timer_vcpu_load(vcpu); kvm_vgic_load(vcpu); kvm_vcpu_load_debug(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); if (kvm_vcpu_should_clear_twe(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_TWE; else vcpu->arch.hcr_el2 |= HCR_TWE; if (kvm_vcpu_should_clear_twi(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_TWI; else vcpu->arch.hcr_el2 |= HCR_TWI; vcpu_set_pauth_traps(vcpu); if (is_protected_kvm_enabled()) { kvm_call_hyp_nvhe(__pkvm_vcpu_load, vcpu->kvm->arch.pkvm.handle, vcpu->vcpu_idx, vcpu->arch.hcr_el2); kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, &vcpu->arch.vgic_cpu.vgic_v3); } if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { if (is_protected_kvm_enabled()) { kvm_call_hyp(__vgic_v3_save_vmcr_aprs, &vcpu->arch.vgic_cpu.vgic_v3); kvm_call_hyp_nvhe(__pkvm_vcpu_put); } kvm_vcpu_put_debug(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_vhe(vcpu); kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); if (vcpu_has_nv(vcpu)) kvm_vcpu_put_hw_mmu(vcpu); kvm_arm_vmid_clear_active(); vcpu_clear_on_unsupported_cpu(vcpu); vcpu->cpu = -1; } static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED); kvm_make_request(KVM_REQ_SLEEP, vcpu); kvm_vcpu_kick(vcpu); } void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->arch.mp_state_lock); __kvm_arm_vcpu_power_off(vcpu); spin_unlock(&vcpu->arch.mp_state_lock); } bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED; } static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu) { WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED); kvm_make_request(KVM_REQ_SUSPEND, vcpu); kvm_vcpu_kick(vcpu); } static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu) { return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { *mp_state = READ_ONCE(vcpu->arch.mp_state); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { int ret = 0; spin_lock(&vcpu->arch.mp_state_lock); switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: WRITE_ONCE(vcpu->arch.mp_state, *mp_state); break; case KVM_MP_STATE_STOPPED: __kvm_arm_vcpu_power_off(vcpu); break; case KVM_MP_STATE_SUSPENDED: kvm_arm_vcpu_suspend(vcpu); break; default: ret = -EINVAL; } spin_unlock(&vcpu->arch.mp_state_lock); return ret; } /** * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled * @v: The VCPU pointer * * If the guest CPU is not waiting for interrupts or an interrupt line is * asserted, the CPU is by definition runnable. */ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF | HCR_VSE); return ((irq_lines || kvm_vgic_vcpu_pending_irq(v)) && !kvm_arm_vcpu_stopped(v) && !v->arch.pause); } bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { return vcpu_mode_priv(vcpu); } #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) { return *vcpu_pc(vcpu); } #endif static void kvm_init_mpidr_data(struct kvm *kvm) { struct kvm_mpidr_data *data = NULL; unsigned long c, mask, nr_entries; u64 aff_set = 0, aff_clr = ~0UL; struct kvm_vcpu *vcpu; mutex_lock(&kvm->arch.config_lock); if (rcu_access_pointer(kvm->arch.mpidr_data) || atomic_read(&kvm->online_vcpus) == 1) goto out; kvm_for_each_vcpu(c, vcpu, kvm) { u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); aff_set |= aff; aff_clr &= aff; } /* * A significant bit can be either 0 or 1, and will only appear in * aff_set. Use aff_clr to weed out the useless stuff. */ mask = aff_set ^ aff_clr; nr_entries = BIT_ULL(hweight_long(mask)); /* * Don't let userspace fool us. If we need more than a single page * to describe the compressed MPIDR array, just fall back to the * iterative method. Single vcpu VMs do not need this either. */ if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE) data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries), GFP_KERNEL_ACCOUNT); if (!data) goto out; data->mpidr_mask = mask; kvm_for_each_vcpu(c, vcpu, kvm) { u64 aff = kvm_vcpu_get_mpidr_aff(vcpu); u16 index = kvm_mpidr_index(data, aff); data->cmpidr_to_idx[index] = c; } rcu_assign_pointer(kvm->arch.mpidr_data, data); out: mutex_unlock(&kvm->arch.config_lock); } /* * Handle both the initialisation that is being done when the vcpu is * run for the first time, as well as the updates that must be * performed each time we get a new thread dealing with this vcpu. */ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; int ret; if (!kvm_vcpu_initialized(vcpu)) return -ENOEXEC; if (!kvm_arm_vcpu_is_finalized(vcpu)) return -EPERM; ret = kvm_arch_vcpu_run_map_fp(vcpu); if (ret) return ret; if (likely(vcpu_has_run_once(vcpu))) return 0; kvm_init_mpidr_data(kvm); if (likely(irqchip_in_kernel(kvm))) { /* * Map the VGIC hardware resources before running a vcpu the * first time on this VM. */ ret = kvm_vgic_map_resources(kvm); if (ret) return ret; } ret = kvm_finalize_sys_regs(vcpu); if (ret) return ret; if (vcpu_has_nv(vcpu)) { ret = kvm_vcpu_allocate_vncr_tlb(vcpu); if (ret) return ret; ret = kvm_vgic_vcpu_nv_init(vcpu); if (ret) return ret; } /* * This needs to happen after any restriction has been applied * to the feature set. */ kvm_calculate_traps(vcpu); ret = kvm_timer_enable(vcpu); if (ret) return ret; if (kvm_vcpu_has_pmu(vcpu)) { ret = kvm_arm_pmu_v3_enable(vcpu); if (ret) return ret; } if (is_protected_kvm_enabled()) { ret = pkvm_create_hyp_vm(kvm); if (ret) return ret; ret = pkvm_create_hyp_vcpu(vcpu); if (ret) return ret; } mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); return ret; } bool kvm_arch_intc_initialized(struct kvm *kvm) { return vgic_initialized(kvm); } void kvm_arm_halt_guest(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) vcpu->arch.pause = true; kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP); } void kvm_arm_resume_guest(struct kvm *kvm) { unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.pause = false; __kvm_vcpu_wake_up(vcpu); } } static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu) { struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); rcuwait_wait_event(wait, (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause), TASK_INTERRUPTIBLE); if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) { /* Awaken to handle a signal, request we sleep again later. */ kvm_make_request(KVM_REQ_SLEEP, vcpu); } /* * Make sure we will observe a potential reset request if we've * observed a change to the power state. Pairs with the smp_wmb() in * kvm_psci_vcpu_on(). */ smp_rmb(); } /** * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior * @vcpu: The VCPU pointer * * Suspend execution of a vCPU until a valid wake event is detected, i.e. until * the vCPU is runnable. The vCPU may or may not be scheduled out, depending * on when a wake event arrives, e.g. there may already be a pending wake event. */ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) { /* * Sync back the state of the GIC CPU interface so that we have * the latest PMR and group enables. This ensures that * kvm_arch_vcpu_runnable has up-to-date data to decide whether * we have pending interrupts, e.g. when determining if the * vCPU should block. * * For the same reason, we want to tell GICv4 that we need * doorbells to be signalled, should an interrupt become pending. */ preempt_disable(); vcpu_set_flag(vcpu, IN_WFI); kvm_vgic_put(vcpu); preempt_enable(); kvm_vcpu_halt(vcpu); vcpu_clear_flag(vcpu, IN_WFIT); preempt_disable(); vcpu_clear_flag(vcpu, IN_WFI); kvm_vgic_load(vcpu); preempt_enable(); } static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) { if (!kvm_arm_vcpu_suspended(vcpu)) return 1; kvm_vcpu_wfi(vcpu); /* * The suspend state is sticky; we do not leave it until userspace * explicitly marks the vCPU as runnable. Request that we suspend again * later. */ kvm_make_request(KVM_REQ_SUSPEND, vcpu); /* * Check to make sure the vCPU is actually runnable. If so, exit to * userspace informing it of the wakeup condition. */ if (kvm_arch_vcpu_runnable(vcpu)) { memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event)); vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP; vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; return 0; } /* * Otherwise, we were unblocked to process a different event, such as a * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to * process the event. */ return 1; } /** * check_vcpu_requests - check and handle pending vCPU requests * @vcpu: the VCPU pointer * * Return: 1 if we should enter the guest * 0 if we should exit to userspace * < 0 if we should exit to userspace, where the return value indicates * an error */ static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) return -EIO; if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) kvm_vcpu_sleep(vcpu); if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) kvm_reset_vcpu(vcpu); /* * Clear IRQ_PENDING requests that were made to guarantee * that a VCPU sees new virtual interrupts. */ kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu); if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu)) kvm_update_stolen_time(vcpu); if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) { /* The distributor enable bits were changed */ preempt_disable(); vgic_v4_put(vcpu); vgic_v4_load(vcpu); preempt_enable(); } if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu)) kvm_vcpu_reload_pmu(vcpu); if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu)) kvm_vcpu_pmu_restore_guest(vcpu); if (kvm_check_request(KVM_REQ_SUSPEND, vcpu)) return kvm_vcpu_suspend(vcpu); if (kvm_dirty_ring_check_request(vcpu)) return 0; check_nested_vcpu_requests(vcpu); } return 1; } static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) { if (likely(!vcpu_mode_is_32bit(vcpu))) return false; if (vcpu_has_nv(vcpu)) return true; return !kvm_supports_32bit_el0(); } /** * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest * @vcpu: The VCPU pointer * @ret: Pointer to write optional return code * * Returns: true if the VCPU needs to return to a preemptible + interruptible * and skip guest entry. * * This function disambiguates between two different types of exits: exits to a * preemptible + interruptible kernel context and exits to userspace. For an * exit to userspace, this function will write the return code to ret and return * true. For an exit to preemptible + interruptible kernel context (i.e. check * for pending work and re-enter), return true without writing to ret. */ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) { struct kvm_run *run = vcpu->run; /* * If we're using a userspace irqchip, then check if we need * to tell a userspace irqchip about timer or PMU level * changes and if so, exit to userspace (the actual level * state gets updated in kvm_timer_update_run and * kvm_pmu_update_run below). */ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { if (kvm_timer_should_notify_user(vcpu) || kvm_pmu_should_notify_user(vcpu)) { *ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; return true; } } if (unlikely(vcpu_on_unsupported_cpu(vcpu))) { run->exit_reason = KVM_EXIT_FAIL_ENTRY; run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED; run->fail_entry.cpu = smp_processor_id(); *ret = 0; return true; } return kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } /* * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while * the vCPU is running. * * This must be noinstr as instrumentation may make use of RCU, and this is not * safe during the EQS. */ static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) { int ret; guest_state_enter_irqoff(); ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); guest_state_exit_irqoff(); return ret; } /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer * * This function is called through the VCPU_RUN ioctl called from user space. It * will execute VM code in a loop until the time slice for the process is used * or some emulation is needed from user space in which case the function will * return with return value 0 and with the kvm_run structure filled in with the * required data for the requested emulation. */ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; int ret; if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu); if (ret <= 0) return ret; } vcpu_load(vcpu); if (!vcpu->wants_to_run) { ret = -EINTR; goto out; } kvm_sigset_activate(vcpu); ret = 1; run->exit_reason = KVM_EXIT_UNKNOWN; run->flags = 0; while (ret > 0) { /* * Check conditions before entering the guest */ ret = xfer_to_guest_mode_handle_work(vcpu); if (!ret) ret = 1; if (ret > 0) ret = check_vcpu_requests(vcpu); /* * Preparing the interrupts to be injected also * involves poking the GIC, which must be done in a * non-preemptible context. */ preempt_disable(); kvm_nested_flush_hwstate(vcpu); if (kvm_vcpu_has_pmu(vcpu)) kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); kvm_vgic_flush_hwstate(vcpu); kvm_pmu_update_vcpu_events(vcpu); /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and * Documentation/virt/kvm/vcpu-requests.rst */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ if (kvm_vcpu_has_pmu(vcpu)) kvm_pmu_sync_hwstate(vcpu); if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); preempt_enable(); continue; } kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); guest_timing_enter_irqoff(); ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; /* * Back from guest *************************************************************/ /* * We must sync the PMU state before the vgic state so * that the vgic can properly sample the updated state of the * interrupt line. */ if (kvm_vcpu_has_pmu(vcpu)) kvm_pmu_sync_hwstate(vcpu); /* * Sync the vgic state before syncing the timer state because * the timer code needs to know if the virtual timer * interrupts are active. */ kvm_vgic_sync_hwstate(vcpu); /* * Sync the timer hardware state before enabling interrupts as * we don't want vtimer interrupts to race with syncing the * timer virtual interrupt state. */ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); if (is_hyp_ctxt(vcpu)) kvm_timer_sync_nested(vcpu); kvm_arch_vcpu_ctxsync_fp(vcpu); /* * We must ensure that any pending interrupts are taken before * we exit guest timing so that timer ticks are accounted as * guest time. Transiently unmask interrupts so that any * pending interrupts are taken. * * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other * context synchronization event) is necessary to ensure that * pending interrupts are taken. */ if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) { local_irq_enable(); isb(); local_irq_disable(); } guest_timing_exit_irqoff(); local_irq_enable(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); kvm_nested_sync_hwstate(vcpu); preempt_enable(); /* * The ARMv8 architecture doesn't give the hypervisor * a mechanism to prevent a guest from dropping to AArch32 EL0 * if implemented by the CPU. If we spot the guest in such * state and that we decided it wasn't supposed to do so (like * with the asymmetric AArch32 case), return to userspace with * a fatal error. */ if (vcpu_mode_is_bad_32bit(vcpu)) { /* * As we have caught the guest red-handed, decide that * it isn't fit for purpose anymore by making the vcpu * invalid. The VMM can try and fix it by issuing a * KVM_ARM_VCPU_INIT if it really wants to. */ vcpu_clear_flag(vcpu, VCPU_INITIALIZED); ret = ARM_EXCEPTION_IL; } ret = handle_exit(vcpu, ret); } /* Tell userspace about in-kernel device output levels */ if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { kvm_timer_update_run(vcpu); kvm_pmu_update_run(vcpu); } kvm_sigset_deactivate(vcpu); out: /* * In the unlikely event that we are returning to userspace * with pending exceptions or PC adjustment, commit these * adjustments in order to give userspace a consistent view of * the vcpu state. Note that this relies on __kvm_adjust_pc() * being preempt-safe on VHE. */ if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || vcpu_get_flag(vcpu, INCREMENT_PC))) kvm_call_hyp(__kvm_adjust_pc, vcpu); vcpu_put(vcpu); return ret; } static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level) { int bit_index; bool set; unsigned long *hcr; if (number == KVM_ARM_IRQ_CPU_IRQ) bit_index = __ffs(HCR_VI); else /* KVM_ARM_IRQ_CPU_FIQ */ bit_index = __ffs(HCR_VF); hcr = vcpu_hcr(vcpu); if (level) set = test_and_set_bit(bit_index, hcr); else set = test_and_clear_bit(bit_index, hcr); /* * If we didn't change anything, no need to wake up or kick other CPUs */ if (set == level) return 0; /* * The vcpu irq_lines field was updated, wake up sleeping VCPUs and * trigger a world-switch round on the running physical CPU to set the * virtual IRQ/FIQ fields in the HCR appropriately. */ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); return 0; } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, bool line_status) { u32 irq = irq_level->irq; unsigned int irq_type, vcpu_id, irq_num; struct kvm_vcpu *vcpu = NULL; bool level = irq_level->level; irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK; vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK; vcpu_id += ((irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK) * (KVM_ARM_IRQ_VCPU_MASK + 1); irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK; trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level); switch (irq_type) { case KVM_ARM_IRQ_TYPE_CPU: if (irqchip_in_kernel(kvm)) return -ENXIO; vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; if (irq_num > KVM_ARM_IRQ_CPU_FIQ) return -EINVAL; return vcpu_interrupt_line(vcpu, irq_num, level); case KVM_ARM_IRQ_TYPE_PPI: if (!irqchip_in_kernel(kvm)) return -ENXIO; vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); if (!vcpu) return -EINVAL; if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS) return -EINVAL; return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL); case KVM_ARM_IRQ_TYPE_SPI: if (!irqchip_in_kernel(kvm)) return -ENXIO; if (irq_num < VGIC_NR_PRIVATE_IRQS) return -EINVAL; return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL); } return -EINVAL; } static unsigned long system_supported_vcpu_features(void) { unsigned long features = KVM_VCPU_VALID_FEATURES; if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1)) clear_bit(KVM_ARM_VCPU_EL1_32BIT, &features); if (!kvm_supports_guest_pmuv3()) clear_bit(KVM_ARM_VCPU_PMU_V3, &features); if (!system_supports_sve()) clear_bit(KVM_ARM_VCPU_SVE, &features); if (!kvm_has_full_ptr_auth()) { clear_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features); clear_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features); } if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT)) clear_bit(KVM_ARM_VCPU_HAS_EL2, &features); return features; } static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned long features = init->features[0]; int i; if (features & ~KVM_VCPU_VALID_FEATURES) return -ENOENT; for (i = 1; i < ARRAY_SIZE(init->features); i++) { if (init->features[i]) return -ENOENT; } if (features & ~system_supported_vcpu_features()) return -EINVAL; /* * For now make sure that both address/generic pointer authentication * features are requested by the userspace together. */ if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &features) != test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &features)) return -EINVAL; if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) return 0; /* MTE is incompatible with AArch32 */ if (kvm_has_mte(vcpu->kvm)) return -EINVAL; /* NV is incompatible with AArch32 */ if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) return -EINVAL; return 0; } static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned long features = init->features[0]; return !bitmap_equal(vcpu->kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); } static int kvm_setup_vcpu(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; int ret = 0; /* * When the vCPU has a PMU, but no PMU is set for the guest * yet, set the default one. */ if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) ret = kvm_arm_set_default_pmu(kvm); /* Prepare for nested if required */ if (!ret && vcpu_has_nv(vcpu)) ret = kvm_vcpu_init_nested(vcpu); return ret; } static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned long features = init->features[0]; struct kvm *kvm = vcpu->kvm; int ret = -EINVAL; mutex_lock(&kvm->arch.config_lock); if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && kvm_vcpu_init_changed(vcpu, init)) goto out_unlock; bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); ret = kvm_setup_vcpu(vcpu); if (ret) goto out_unlock; /* Now we know what it is, we can reset it. */ kvm_reset_vcpu(vcpu); set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); vcpu_set_flag(vcpu, VCPU_INITIALIZED); ret = 0; out_unlock: mutex_unlock(&kvm->arch.config_lock); return ret; } static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { int ret; if (init->target != KVM_ARM_TARGET_GENERIC_V8 && init->target != kvm_target_cpu()) return -EINVAL; ret = kvm_vcpu_init_check_features(vcpu, init); if (ret) return ret; if (!kvm_vcpu_initialized(vcpu)) return __kvm_vcpu_set_target(vcpu, init); if (kvm_vcpu_init_changed(vcpu, init)) return -EINVAL; kvm_reset_vcpu(vcpu); return 0; } static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { bool power_off = false; int ret; /* * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid * reflecting it in the finalized feature set, thus limiting its scope * to a single KVM_ARM_VCPU_INIT call. */ if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) { init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF); power_off = true; } ret = kvm_vcpu_set_target(vcpu, init); if (ret) return ret; /* * Ensure a rebooted VM will fault in RAM pages and detect if the * guest MMU is turned off and flush the caches as needed. * * S2FWB enforces all memory accesses to RAM being cacheable, * ensuring that the data side is always coherent. We still * need to invalidate the I-cache though, as FWB does *not* * imply CTR_EL0.DIC. */ if (vcpu_has_run_once(vcpu)) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) stage2_unmap_vm(vcpu->kvm); else icache_inval_all_pou(); } vcpu_reset_hcr(vcpu); /* * Handle the "start in power-off" case. */ spin_lock(&vcpu->arch.mp_state_lock); if (power_off) __kvm_arm_vcpu_power_off(vcpu); else WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); spin_unlock(&vcpu->arch.mp_state_lock); return 0; } static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret = -ENXIO; switch (attr->group) { default: ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr); break; } return ret; } static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret = -ENXIO; switch (attr->group) { default: ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr); break; } return ret; } static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) { int ret = -ENXIO; switch (attr->group) { default: ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr); break; } return ret; } static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { memset(events, 0, sizeof(*events)); return __kvm_arm_vcpu_get_events(vcpu, events); } static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { int i; /* check whether the reserved field is zero */ for (i = 0; i < ARRAY_SIZE(events->reserved); i++) if (events->reserved[i]) return -EINVAL; /* check whether the pad field is zero */ for (i = 0; i < ARRAY_SIZE(events->exception.pad); i++) if (events->exception.pad[i]) return -EINVAL; return __kvm_arm_vcpu_set_events(vcpu, events); } long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; struct kvm_device_attr attr; long r; switch (ioctl) { case KVM_ARM_VCPU_INIT: { struct kvm_vcpu_init init; r = -EFAULT; if (copy_from_user(&init, argp, sizeof(init))) break; r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); break; } case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) break; r = -EFAULT; if (copy_from_user(&reg, argp, sizeof(reg))) break; /* * We could owe a reset due to PSCI. Handle the pending reset * here to ensure userspace register accesses are ordered after * the reset. */ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) kvm_reset_vcpu(vcpu); if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, &reg); else r = kvm_arm_get_reg(vcpu, &reg); break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) break; r = -EPERM; if (!kvm_arm_vcpu_is_finalized(vcpu)) break; r = -EFAULT; if (copy_from_user(&reg_list, user_list, sizeof(reg_list))) break; n = reg_list.n; reg_list.n = kvm_arm_num_regs(vcpu); if (copy_to_user(user_list, &reg_list, sizeof(reg_list))) break; r = -E2BIG; if (n < reg_list.n) break; r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); break; } case KVM_SET_DEVICE_ATTR: { r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) break; r = kvm_arm_vcpu_set_attr(vcpu, &attr); break; } case KVM_GET_DEVICE_ATTR: { r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) break; r = kvm_arm_vcpu_get_attr(vcpu, &attr); break; } case KVM_HAS_DEVICE_ATTR: { r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) break; r = kvm_arm_vcpu_has_attr(vcpu, &attr); break; } case KVM_GET_VCPU_EVENTS: { struct kvm_vcpu_events events; if (kvm_arm_vcpu_get_events(vcpu, &events)) return -EINVAL; if (copy_to_user(argp, &events, sizeof(events))) return -EFAULT; return 0; } case KVM_SET_VCPU_EVENTS: { struct kvm_vcpu_events events; if (copy_from_user(&events, argp, sizeof(events))) return -EFAULT; return kvm_arm_vcpu_set_events(vcpu, &events); } case KVM_ARM_VCPU_FINALIZE: { int what; if (!kvm_vcpu_initialized(vcpu)) return -ENOEXEC; if (get_user(what, (const int __user *)argp)) return -EFAULT; return kvm_arm_vcpu_finalize(vcpu, what); } default: r = -EINVAL; } return r; } void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { } static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { case KVM_ARM_DEVICE_VGIC_V2: if (!vgic_present) return -ENXIO; return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); default: return -ENODEV; } } static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_ARM_VM_SMCCC_CTRL: return kvm_vm_smccc_has_attr(kvm, attr); default: return -ENXIO; } } static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr) { switch (attr->group) { case KVM_ARM_VM_SMCCC_CTRL: return kvm_vm_smccc_set_attr(kvm, attr); default: return -ENXIO; } } int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; struct kvm_device_attr attr; switch (ioctl) { case KVM_CREATE_IRQCHIP: { int ret; if (!vgic_present) return -ENXIO; mutex_lock(&kvm->lock); ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); mutex_unlock(&kvm->lock); return ret; } case KVM_ARM_SET_DEVICE_ADDR: { struct kvm_arm_device_addr dev_addr; if (copy_from_user(&dev_addr, argp, sizeof(dev_addr))) return -EFAULT; return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } case KVM_ARM_PREFERRED_TARGET: { struct kvm_vcpu_init init = { .target = KVM_ARM_TARGET_GENERIC_V8, }; if (copy_to_user(argp, &init, sizeof(init))) return -EFAULT; return 0; } case KVM_ARM_MTE_COPY_TAGS: { struct kvm_arm_copy_mte_tags copy_tags; if (copy_from_user(&copy_tags, argp, sizeof(copy_tags))) return -EFAULT; return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags); } case KVM_ARM_SET_COUNTER_OFFSET: { struct kvm_arm_counter_offset offset; if (copy_from_user(&offset, argp, sizeof(offset))) return -EFAULT; return kvm_vm_ioctl_set_counter_offset(kvm, &offset); } case KVM_HAS_DEVICE_ATTR: { if (copy_from_user(&attr, argp, sizeof(attr))) return -EFAULT; return kvm_vm_has_attr(kvm, &attr); } case KVM_SET_DEVICE_ATTR: { if (copy_from_user(&attr, argp, sizeof(attr))) return -EFAULT; return kvm_vm_set_attr(kvm, &attr); } case KVM_ARM_GET_REG_WRITABLE_MASKS: { struct reg_mask_range range; if (copy_from_user(&range, argp, sizeof(range))) return -EFAULT; return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range); } default: return -EINVAL; } } static unsigned long nvhe_percpu_size(void) { return (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end) - (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start); } static unsigned long nvhe_percpu_order(void) { unsigned long size = nvhe_percpu_size(); return size ? get_order(size) : 0; } static size_t pkvm_host_sve_state_order(void) { return get_order(pkvm_host_sve_state_size()); } /* A lookup table holding the hypervisor VA for each vector slot */ static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) { hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot); } static int kvm_init_vector_slots(void) { int err; void *base; base = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector)); kvm_init_vector_slot(base, HYP_VECTOR_DIRECT); base = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs)); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_DIRECT); if (kvm_system_needs_idmapped_vectors() && !is_protected_kvm_enabled()) { err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs), __BP_HARDEN_HYP_VECS_SZ, &base); if (err) return err; } kvm_init_vector_slot(base, HYP_VECTOR_INDIRECT); kvm_init_vector_slot(base, HYP_VECTOR_SPECTRE_INDIRECT); return 0; } static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); unsigned long tcr; /* * Calculate the raw per-cpu offset without a translation from the * kernel's mapping to the linear mapping, and store it in tpidr_el2 * so that we can use adr_l to access per-cpu variables in EL2. * Also drop the KASAN tag which gets in the way... */ params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) - (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); params->mair_el2 = read_sysreg(mair_el1); tcr = read_sysreg(tcr_el1); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); tcr |= TCR_EPD1_MASK; } else { unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); tcr &= TCR_EL2_MASK; tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); if (lpa2_is_enabled()) tcr |= TCR_EL2_DS; } tcr |= TCR_T0SZ(hyp_va_bits); params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); if (is_protected_kvm_enabled()) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; else params->hcr_el2 = HCR_HOST_NVHE_FLAGS; if (cpus_have_final_cap(ARM64_KVM_HVHE)) params->hcr_el2 |= HCR_E2H; params->vttbr = params->vtcr = 0; /* * Flush the init params from the data cache because the struct will * be read while the MMU is off. */ kvm_flush_dcache_to_poc(params, sizeof(*params)); } static void hyp_install_host_vector(void) { struct kvm_nvhe_init_params *params; struct arm_smccc_res res; /* Switch from the HYP stub to our own HYP init vector */ __hyp_set_vectors(kvm_get_idmap_vector()); /* * Call initialization code, and switch to the full blown HYP code. * If the cpucaps haven't been finalized yet, something has gone very * wrong, and hyp will crash and burn when it uses any * cpus_have_*_cap() wrapper. */ BUG_ON(!system_capabilities_finalized()); params = this_cpu_ptr_nvhe_sym(kvm_init_params); arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); WARN_ON(res.a0 != SMCCC_RET_SUCCESS); } static void cpu_init_hyp_mode(void) { hyp_install_host_vector(); /* * Disabling SSBD on a non-VHE system requires us to enable SSBS * at EL2. */ if (this_cpu_has_cap(ARM64_SSBS) && arm64_get_spectre_v4_state() == SPECTRE_VULNERABLE) { kvm_call_hyp_nvhe(__kvm_enable_ssbs); } } static void cpu_hyp_reset(void) { if (!is_kernel_in_hyp_mode()) __hyp_reset_vectors(); } /* * EL2 vectors can be mapped and rerouted in a number of ways, * depending on the kernel configuration and CPU present: * * - If the CPU is affected by Spectre-v2, the hardening sequence is * placed in one of the vector slots, which is executed before jumping * to the real vectors. * * - If the CPU also has the ARM64_SPECTRE_V3A cap, the slot * containing the hardening sequence is mapped next to the idmap page, * and executed before jumping to the real vectors. * * - If the CPU only has the ARM64_SPECTRE_V3A cap, then an * empty slot is selected, mapped next to the idmap page, and * executed before jumping to the real vectors. * * Note that ARM64_SPECTRE_V3A is somewhat incompatible with * VHE, as we don't have hypervisor-specific mappings. If the system * is VHE and yet selects this capability, it will be ignored. */ static void cpu_set_hyp_vector(void) { struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); void *vector = hyp_spectre_vector_selector[data->slot]; if (!is_protected_kvm_enabled()) *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; else kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); } static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); kvm_init_host_debug_data(); if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); } static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); if (vgic_present) kvm_vgic_init_cpu_hardware(); } static void cpu_hyp_reinit(void) { cpu_hyp_reset(); cpu_hyp_init_context(); cpu_hyp_init_features(); } static void cpu_hyp_init(void *discard) { if (!__this_cpu_read(kvm_hyp_initialized)) { cpu_hyp_reinit(); __this_cpu_write(kvm_hyp_initialized, 1); } } static void cpu_hyp_uninit(void *discard) { if (__this_cpu_read(kvm_hyp_initialized)) { cpu_hyp_reset(); __this_cpu_write(kvm_hyp_initialized, 0); } } int kvm_arch_enable_virtualization_cpu(void) { /* * Most calls to this function are made with migration * disabled, but not with preemption disabled. The former is * enough to ensure correctness, but most of the helpers * expect the later and will throw a tantrum otherwise. */ preempt_disable(); cpu_hyp_init(NULL); kvm_vgic_cpu_up(); kvm_timer_cpu_up(); preempt_enable(); return 0; } void kvm_arch_disable_virtualization_cpu(void) { kvm_timer_cpu_down(); kvm_vgic_cpu_down(); if (!is_protected_kvm_enabled()) cpu_hyp_uninit(NULL); } #ifdef CONFIG_CPU_PM static int hyp_init_cpu_pm_notifier(struct notifier_block *self, unsigned long cmd, void *v) { /* * kvm_hyp_initialized is left with its old value over * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should * re-enable hyp. */ switch (cmd) { case CPU_PM_ENTER: if (__this_cpu_read(kvm_hyp_initialized)) /* * don't update kvm_hyp_initialized here * so that the hyp will be re-enabled * when we resume. See below. */ cpu_hyp_reset(); return NOTIFY_OK; case CPU_PM_ENTER_FAILED: case CPU_PM_EXIT: if (__this_cpu_read(kvm_hyp_initialized)) /* The hyp was enabled before suspend. */ cpu_hyp_reinit(); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block hyp_init_cpu_pm_nb = { .notifier_call = hyp_init_cpu_pm_notifier, }; static void __init hyp_cpu_pm_init(void) { if (!is_protected_kvm_enabled()) cpu_pm_register_notifier(&hyp_init_cpu_pm_nb); } static void __init hyp_cpu_pm_exit(void) { if (!is_protected_kvm_enabled()) cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb); } #else static inline void __init hyp_cpu_pm_init(void) { } static inline void __init hyp_cpu_pm_exit(void) { } #endif static void __init init_cpu_logical_map(void) { unsigned int cpu; /* * Copy the MPIDR <-> logical CPU ID mapping to hyp. * Only copy the set of online CPUs whose features have been checked * against the finalized system capabilities. The hypervisor will not * allow any other CPUs from the `possible` set to boot. */ for_each_online_cpu(cpu) hyp_cpu_logical_map[cpu] = cpu_logical_map(cpu); } #define init_psci_0_1_impl_state(config, what) \ config.psci_0_1_ ## what ## _implemented = psci_ops.what static bool __init init_psci_relay(void) { /* * If PSCI has not been initialized, protected KVM cannot install * itself on newly booted CPUs. */ if (!psci_ops.get_version) { kvm_err("Cannot initialize protected mode without PSCI\n"); return false; } kvm_host_psci_config.version = psci_ops.get_version(); kvm_host_psci_config.smccc_version = arm_smccc_get_version(); if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend); init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on); init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off); init_psci_0_1_impl_state(kvm_host_psci_config, migrate); } return true; } static int __init init_subsystems(void) { int err = 0; /* * Enable hardware so that subsystem initialisation can access EL2. */ on_each_cpu(cpu_hyp_init, NULL, 1); /* * Register CPU lower-power notifier */ hyp_cpu_pm_init(); /* * Init HYP view of VGIC */ err = kvm_vgic_hyp_init(); switch (err) { case 0: vgic_present = true; break; case -ENODEV: case -ENXIO: /* * No VGIC? No pKVM for you. * * Protected mode assumes that VGICv3 is present, so no point * in trying to hobble along if vgic initialization fails. */ if (is_protected_kvm_enabled()) goto out; /* * Otherwise, userspace could choose to implement a GIC for its * guest on non-cooperative hardware. */ vgic_present = false; err = 0; break; default: goto out; } if (kvm_mode == KVM_MODE_NV && !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) { kvm_err("NV support requires GICv3, giving up\n"); err = -EINVAL; goto out; } /* * Init HYP architected timer support */ err = kvm_timer_hyp_init(vgic_present); if (err) goto out; kvm_register_perf_callbacks(NULL); out: if (err) hyp_cpu_pm_exit(); if (err || !is_protected_kvm_enabled()) on_each_cpu(cpu_hyp_uninit, NULL, 1); return err; } static void __init teardown_subsystems(void) { kvm_unregister_perf_callbacks(); hyp_cpu_pm_exit(); } static void __init teardown_hyp_mode(void) { bool free_sve = system_supports_sve() && is_protected_kvm_enabled(); int cpu; free_hyp_pgds(); for_each_possible_cpu(cpu) { free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); if (free_sve) { struct cpu_sve_state *sve_state; sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; free_pages((unsigned long) sve_state, pkvm_host_sve_state_order()); } } } static int __init do_pkvm_init(u32 hyp_va_bits) { void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)); int ret; preempt_disable(); cpu_hyp_init_context(); ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, num_possible_cpus(), kern_hyp_va(per_cpu_base), hyp_va_bits); cpu_hyp_init_features(); /* * The stub hypercalls are now disabled, so set our local flag to * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu(). */ __this_cpu_write(kvm_hyp_initialized, 1); preempt_enable(); return ret; } static u64 get_hyp_id_aa64pfr0_el1(void) { /* * Track whether the system isn't affected by spectre/meltdown in the * hypervisor's view of id_aa64pfr0_el1, used for protected VMs. * Although this is per-CPU, we make it global for simplicity, e.g., not * to have to worry about vcpu migration. * * Unlike for non-protected VMs, userspace cannot override this for * protected VMs. */ u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED); val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), arm64_get_meltdown_state() == SPECTRE_UNAFFECTED); return val; } static void kvm_hyp_init_symbols(void) { kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1(); kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1); kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1); kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1); kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1); kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1); kvm_nvhe_sym(__icache_flags) = __icache_flags; kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits; /* Propagate the FGT state to the the nVHE side */ kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks; kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks; kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks; kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks; kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks; kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks; kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks; kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks; kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks; kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks; kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks; /* * Flush entire BSS since part of its data containing init symbols is read * while the MMU is off. */ kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start), kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start)); } static int __init kvm_hyp_init_protection(u32 hyp_va_bits) { void *addr = phys_to_virt(hyp_mem_base); int ret; ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); if (ret) return ret; ret = do_pkvm_init(hyp_va_bits); if (ret) return ret; free_hyp_pgds(); return 0; } static int init_pkvm_host_sve_state(void) { int cpu; if (!system_supports_sve()) return 0; /* Allocate pages for host sve state in protected mode. */ for_each_possible_cpu(cpu) { struct page *page = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order()); if (!page) return -ENOMEM; per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(page); } /* * Don't map the pages in hyp since these are only used in protected * mode, which will (re)create its own mapping when initialized. */ return 0; } /* * Finalizes the initialization of hyp mode, once everything else is initialized * and the initialziation process cannot fail. */ static void finalize_init_hyp_mode(void) { int cpu; if (system_supports_sve() && is_protected_kvm_enabled()) { for_each_possible_cpu(cpu) { struct cpu_sve_state *sve_state; sve_state = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state; per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = kern_hyp_va(sve_state); } } } static void pkvm_hyp_init_ptrauth(void) { struct kvm_cpu_context *hyp_ctxt; int cpu; for_each_possible_cpu(cpu) { hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu); hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long(); hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long(); hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long(); hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long(); hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long(); hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long(); hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long(); hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long(); hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long(); hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long(); } } /* Inits Hyp-mode on all online CPUs */ static int __init init_hyp_mode(void) { u32 hyp_va_bits; int cpu; int err = -ENOMEM; /* * The protected Hyp-mode cannot be initialized if the memory pool * allocation has failed. */ if (is_protected_kvm_enabled() && !hyp_mem_base) goto out_err; /* * Allocate Hyp PGD and setup Hyp identity mapping */ err = kvm_mmu_init(&hyp_va_bits); if (err) goto out_err; /* * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { unsigned long stack_base; stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); if (!stack_base) { err = -ENOMEM; goto out_err; } per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; } /* * Allocate and initialize pages for Hypervisor-mode percpu regions. */ for_each_possible_cpu(cpu) { struct page *page; void *page_addr; page = alloc_pages(GFP_KERNEL, nvhe_percpu_order()); if (!page) { err = -ENOMEM; goto out_err; } page_addr = page_address(page); memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size()); kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr; } /* * Map the Hyp-code called directly from the host */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start), kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC); if (err) { kvm_err("Cannot map world-switch code\n"); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start), kvm_ksym_ref(__hyp_data_end), PAGE_HYP); if (err) { kvm_err("Cannot map .hyp.data section\n"); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start), kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO); if (err) { kvm_err("Cannot map .hyp.rodata section\n"); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__start_rodata), kvm_ksym_ref(__end_rodata), PAGE_HYP_RO); if (err) { kvm_err("Cannot map rodata section\n"); goto out_err; } /* * .hyp.bss is guaranteed to be placed at the beginning of the .bss * section thanks to an assertion in the linker script. Map it RW and * the rest of .bss RO. */ err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); if (err) { kvm_err("Cannot map hyp bss section: %d\n", err); goto out_err; } err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); if (err) { kvm_err("Cannot map bss section\n"); goto out_err; } /* * Map the Hyp stack pages */ for_each_possible_cpu(cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; } /* * Save the stack PA in nvhe_init_params. This will be needed * to recreate the stack mapping in protected nVHE mode. * __hyp_pa() won't do the right thing there, since the stack * has been mapped in the flexible private VA space. */ params->stack_pa = __pa(stack_base); } for_each_possible_cpu(cpu) { char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu]; char *percpu_end = percpu_begin + nvhe_percpu_size(); /* Map Hyp percpu pages */ err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); if (err) { kvm_err("Cannot map hyp percpu region\n"); goto out_err; } /* Prepare the CPU initialization parameters */ cpu_prepare_hyp_mode(cpu, hyp_va_bits); } kvm_hyp_init_symbols(); if (is_protected_kvm_enabled()) { if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH)) pkvm_hyp_init_ptrauth(); init_cpu_logical_map(); if (!init_psci_relay()) { err = -ENODEV; goto out_err; } err = init_pkvm_host_sve_state(); if (err) goto out_err; err = kvm_hyp_init_protection(hyp_va_bits); if (err) { kvm_err("Failed to init hyp memory protection\n"); goto out_err; } } return 0; out_err: teardown_hyp_mode(); kvm_err("error initializing Hyp mode: %d\n", err); return err; } struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu = NULL; struct kvm_mpidr_data *data; unsigned long i; mpidr &= MPIDR_HWID_BITMASK; rcu_read_lock(); data = rcu_dereference(kvm->arch.mpidr_data); if (data) { u16 idx = kvm_mpidr_index(data, mpidr); vcpu = kvm_get_vcpu(kvm, data->cmpidr_to_idx[idx]); if (mpidr != kvm_vcpu_get_mpidr_aff(vcpu)) vcpu = NULL; } rcu_read_unlock(); if (vcpu) return vcpu; kvm_for_each_vcpu(i, vcpu, kvm) { if (mpidr == kvm_vcpu_get_mpidr_aff(vcpu)) return vcpu; } return NULL; } bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) { return irqchip_in_kernel(kvm); } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; /* * The only thing we have a chance of directly-injecting is LPIs. Maybe * one day... */ if (irq_entry->type != KVM_IRQ_ROUTING_MSI) return 0; return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, &irqfd->irq_entry); } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); struct kvm_kernel_irq_routing_entry *irq_entry = &irqfd->irq_entry; if (irq_entry->type != KVM_IRQ_ROUTING_MSI) return; kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq); } bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { if (old->type != KVM_IRQ_ROUTING_MSI || new->type != KVM_IRQ_ROUTING_MSI) return true; return memcmp(&old->msi, &new->msi, sizeof(new->msi)); } int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { /* * Remapping the vLPI requires taking the its_lock mutex to resolve * the new translation. We're in spinlock land at this point, so no * chance of resolving the translation. * * Unmap the vLPI and fall back to software LPI injection. */ return kvm_vgic_v4_unset_forwarding(kvm, host_irq); } void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); kvm_arm_halt_guest(irqfd->kvm); } void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); kvm_arm_resume_guest(irqfd->kvm); } /* Initialize Hyp-mode and memory mappings on all CPUs */ static __init int kvm_arm_init(void) { int err; bool in_hyp_mode; if (!is_hyp_mode_available()) { kvm_info("HYP mode not available\n"); return -ENODEV; } if (kvm_get_mode() == KVM_MODE_NONE) { kvm_info("KVM disabled from command line\n"); return -ENODEV; } err = kvm_sys_reg_table_init(); if (err) { kvm_info("Error initializing system register tables"); return err; } in_hyp_mode = is_kernel_in_hyp_mode(); if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || cpus_have_final_cap(ARM64_WORKAROUND_1508412)) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); err = kvm_set_ipa_limit(); if (err) return err; err = kvm_arm_init_sve(); if (err) return err; err = kvm_arm_vmid_alloc_init(); if (err) { kvm_err("Failed to initialize VMID allocator.\n"); return err; } if (!in_hyp_mode) { err = init_hyp_mode(); if (err) goto out_err; } err = kvm_init_vector_slots(); if (err) { kvm_err("Cannot initialise vector slots\n"); goto out_hyp; } err = init_subsystems(); if (err) goto out_hyp; kvm_info("%s%sVHE%s mode initialized successfully\n", in_hyp_mode ? "" : (is_protected_kvm_enabled() ? "Protected " : "Hyp "), in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ? "h" : "n"), cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": ""); /* * FIXME: Do something reasonable if kvm_init() fails after pKVM * hypervisor protection is finalized. */ err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (err) goto out_subs; /* * This should be called after initialization is done and failure isn't * possible anymore. */ if (!in_hyp_mode) finalize_init_hyp_mode(); kvm_arm_initialised = true; return 0; out_subs: teardown_subsystems(); out_hyp: if (!in_hyp_mode) teardown_hyp_mode(); out_err: kvm_arm_vmid_alloc_free(); return err; } static int __init early_kvm_mode_cfg(char *arg) { if (!arg) return -EINVAL; if (strcmp(arg, "none") == 0) { kvm_mode = KVM_MODE_NONE; return 0; } if (!is_hyp_mode_available()) { pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n"); return 0; } if (strcmp(arg, "protected") == 0) { if (!is_kernel_in_hyp_mode()) kvm_mode = KVM_MODE_PROTECTED; else pr_warn_once("Protected KVM not available with VHE\n"); return 0; } if (strcmp(arg, "nvhe") == 0 && !WARN_ON(is_kernel_in_hyp_mode())) { kvm_mode = KVM_MODE_DEFAULT; return 0; } if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) { kvm_mode = KVM_MODE_NV; return 0; } return -EINVAL; } early_param("kvm-arm.mode", early_kvm_mode_cfg); static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p) { if (!arg) return -EINVAL; if (strcmp(arg, "trap") == 0) { *p = KVM_WFX_TRAP; return 0; } if (strcmp(arg, "notrap") == 0) { *p = KVM_WFX_NOTRAP; return 0; } return -EINVAL; } static int __init early_kvm_wfi_trap_policy_cfg(char *arg) { return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfi_trap_policy); } early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg); static int __init early_kvm_wfe_trap_policy_cfg(char *arg) { return early_kvm_wfx_trap_policy_cfg(arg, &kvm_wfe_trap_policy); } early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg); enum kvm_mode kvm_get_mode(void) { return kvm_mode; } module_init(kvm_arm_init);
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 /* SPDX-License-Identifier: GPL-2.0 */ /* * A security context is a set of security attributes * associated with each subject and object controlled * by the security policy. Security contexts are * externally represented as variable-length strings * that can be interpreted by a user or application * with an understanding of the security policy. * Internally, the security server uses a simple * structure. This structure is private to the * security server and can be changed without affecting * clients of the security server. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ #include "ebitmap.h" #include "mls_types.h" #include "security.h" /* * A security context consists of an authenticated user * identity, a role, a type and a MLS range. */ struct context { u32 user; u32 role; u32 type; u32 len; /* length of string in bytes */ struct mls_range range; char *str; /* string representation if context cannot be mapped. */ }; static inline void mls_context_init(struct context *c) { memset(&c->range, 0, sizeof(c->range)); } static inline int mls_context_cpy(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the low level of 'src'. */ static inline int mls_context_cpy_low(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the high level of 'src'. */ static inline int mls_context_cpy_high(struct context *dst, const struct context *src) { int rc; dst->range.level[0].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_glblub(struct context *dst, const struct context *c1, const struct context *c2) { struct mls_range *dr = &dst->range; const struct mls_range *r1 = &c1->range, *r2 = &c2->range; int rc = 0; if (r1->level[1].sens < r2->level[0].sens || r2->level[1].sens < r1->level[0].sens) /* These ranges have no common sensitivities */ return -EINVAL; /* Take the greatest of the low */ dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens); /* Take the least of the high */ dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens); rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat, &r2->level[0].cat); if (rc) goto out; rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat, &r2->level[1].cat); if (rc) goto out; out: return rc; } static inline bool mls_context_equal(const struct context *c1, const struct context *c2) { return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_equal(&c1->range.level[0].cat, &c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_equal(&c1->range.level[1].cat, &c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, const struct context *src) { int rc; dst->user = src->user; dst->role = src->role; dst->type = src->type; if (src->str) { dst->str = kstrdup(src->str, GFP_ATOMIC); if (!dst->str) return -ENOMEM; dst->len = src->len; } else { dst->str = NULL; dst->len = 0; } rc = mls_context_cpy(dst, src); if (rc) { kfree(dst->str); dst->str = NULL; dst->len = 0; return rc; } return 0; } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; kfree(c->str); c->str = NULL; c->len = 0; mls_context_destroy(c); } static inline bool context_equal(const struct context *c1, const struct context *c2) { if (c1->len && c2->len) return (c1->len == c2->len && !strcmp(c1->str, c2->str)); if (c1->len || c2->len) return 0; return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_equal(c1, c2)); } u32 context_compute_hash(const struct context *c); #endif /* _SS_CONTEXT_H_ */
5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 Forwarding Information Base: semantics. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/netlink.h> #include <linux/hash.h> #include <linux/nospec.h> #include <net/arp.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/nexthop.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/addrconf.h> #include "fib_lookup.h" /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ #ifdef CONFIG_IP_ROUTE_MULTIPATH #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ for (nhsel = 0, nh = (fi)->fib_nh; \ nhsel < fib_info_num_path((fi)); \ nh++, nhsel++) #define change_nexthops(fi) { \ int nhsel; struct fib_nh *nexthop_nh; \ for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ nhsel < fib_info_num_path((fi)); \ nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ for (nhsel = 0; nhsel < 1; nhsel++) #define change_nexthops(fi) { \ int nhsel; \ struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ #define endfor_nexthops(fi) } const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, }, [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, }, [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, }, [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, }, [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, }, [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, }, [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, }, [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, }, [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, }, }; static void rt_fibinfo_free(struct rtable __rcu **rtp) { struct rtable *rt = rcu_dereference_protected(*rtp, 1); if (!rt) return; /* Not even needed : RCU_INIT_POINTER(*rtp, NULL); * because we waited an RCU grace period before calling * free_fib_info_rcu() */ dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } static void free_nh_exceptions(struct fib_nh_common *nhc) { struct fnhe_hash_bucket *hash; int i; hash = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!hash) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; fnhe = rcu_dereference_protected(hash[i].chain, 1); while (fnhe) { struct fib_nh_exception *next; next = rcu_dereference_protected(fnhe->fnhe_next, 1); rt_fibinfo_free(&fnhe->fnhe_rth_input); rt_fibinfo_free(&fnhe->fnhe_rth_output); kfree(fnhe); fnhe = next; } } kfree(hash); } static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) { int cpu; if (!rtp) return; for_each_possible_cpu(cpu) { struct rtable *rt; rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1); if (rt) { dst_dev_put(&rt->dst); dst_release_immediate(&rt->dst); } } free_percpu(rtp); } void fib_nh_common_release(struct fib_nh_common *nhc) { netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); free_nh_exceptions(nhc); } EXPORT_SYMBOL_GPL(fib_nh_common_release); void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } /* Release a nexthop info record */ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); if (fi->nh) { nexthop_put(fi->nh); } else { change_nexthops(fi) { fib_nh_release(fi->fib_net, nexthop_nh); } endfor_nexthops(fi); } ip_fib_metrics_put(fi->fib_metrics); kfree(fi); } void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { pr_warn("Freeing alive fib_info %p\n", fi); return; } call_rcu_hurry(&fi->rcu, free_fib_info_rcu); } EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); if (fi->nh) { list_del(&fi->nh_list); } else { change_nexthops(fi) { if (!nexthop_nh->fib_nh_dev) continue; hlist_del_rcu(&nexthop_nh->nh_hash); } endfor_nexthops(fi) } /* Paired with READ_ONCE() from fib_table_lookup() */ WRITE_ONCE(fi->fib_dead, 1); fib_info_put(fi); } } static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi) { const struct fib_nh *onh; if (fi->nh || ofi->nh) return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1; if (ofi->fib_nhs == 0) return 0; for_nexthops(fi) { onh = fib_info_nh(ofi, nhsel); if (nh->fib_nh_oif != onh->fib_nh_oif || nh->fib_nh_gw_family != onh->fib_nh_gw_family || nh->fib_nh_scope != onh->fib_nh_scope || #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight != onh->fib_nh_weight || #endif #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) || ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK)) return -1; if (nh->fib_nh_gw_family == AF_INET && nh->fib_nh_gw4 != onh->fib_nh_gw4) return -1; if (nh->fib_nh_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6)) return -1; } endfor_nexthops(fi); return 0; } static struct hlist_head *fib_nh_head(struct net_device *dev) { return &dev->fib_nh_head; } static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, u32 prefsrc, u32 priority) { unsigned int val = init_val; val ^= (protocol << 8) | scope; val ^= prefsrc; val ^= priority; return val; } static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, fi->fib_scope, (__force u32)fi->fib_prefsrc, fi->fib_priority); if (fi->nh) { val ^= fi->nh->id; } else { for_nexthops(fi) { val ^= nh->fib_nh_oif; } endfor_nexthops(fi) } return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { unsigned int hash_bits = net->ipv4.fib_info_hash_bits; u32 slot; slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), GFP_KERNEL); } static void fib_info_hash_free(struct hlist_head *head) { kvfree(head); } static void fib_info_hash_grow(struct net *net) { unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; unsigned int i; if (net->ipv4.fib_info_cnt < old_size) return; new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; old_info_hash = net->ipv4.fib_info_hash; net->ipv4.fib_info_hash = new_info_hash; net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, head, fib_hash) hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); } for (i = 0; i < old_size; i++) { struct hlist_head *lhead = &old_info_hash[old_size + i]; struct hlist_node *n; struct fib_info *fi; hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) hlist_add_head(&fi->fib_lhash, fib_info_laddrhash_bucket(fi->fib_net, fi->fib_prefsrc)); } fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) { struct hlist_head *head; struct fib_info *fi; unsigned int hash; hash = fib_info_hashfn_1(cfg->fc_nh_id, cfg->fc_protocol, cfg->fc_scope, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && cfg->fc_priority == fi->fib_priority && cfg->fc_type == fi->fib_type && cfg->fc_table == fi->fib_tb_id && !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK)) return fi; } return NULL; } static struct fib_info *fib_find_info(struct fib_info *nfi) { struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && nfi->fib_priority == fi->fib_priority && nfi->fib_type == fi->fib_type && nfi->fib_tb_id == fi->fib_tb_id && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(u32) * RTAX_MAX) == 0 && !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) && nh_comp(fi, nfi) == 0) return fi; } return NULL; } /* Check, that the gateway is already configured. * Used only by redirect accept routine, under rcu_read_lock(); */ int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; struct fib_nh *nh; head = fib_nh_head(dev); hlist_for_each_entry_rcu(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (nh->fib_nh_gw4 == gw && !(nh->fib_nh_flags & RTNH_F_DEAD)) { return 0; } } return -1; } size_t fib_nlmsg_size(struct fib_info *fi) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_DST */ + nla_total_size(4) /* RTA_PRIORITY */ + nla_total_size(4) /* RTA_PREFSRC */ + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */ unsigned int nhs = fib_info_num_path(fi); /* space for nested metrics */ payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->nh) payload += nla_total_size(4); /* RTA_NH_ID */ if (nhs) { size_t nh_encapsize = 0; /* Also handles the special case nhs == 1 */ /* each nexthop is packed in an attribute */ size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); unsigned int i; /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); /* grab encap info */ for (i = 0; i < fib_info_num_path(fi); i++) { struct fib_nh_common *nhc = fib_info_nhc(fi, i); if (nhc->nhc_lwtstate) { /* RTA_ENCAP_TYPE */ nh_encapsize += lwtunnel_get_encap_size( nhc->nhc_lwtstate); /* RTA_ENCAP */ nh_encapsize += nla_total_size(2); } } /* all nexthops are packed in a nested attribute */ payload += nla_total_size((nhs * nhsize) + nh_encapsize); } return payload; } void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); if (!skb) goto errout; fri.fi = fa->fa_info; fri.tb_id = tb_id; fri.dst = key; fri.dst_len = dst_len; fri.dscp = fa->fa_dscp; fri.type = fa->fa_type; fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE, info->nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); } static int fib_detect_death(struct fib_info *fi, int order, struct fib_info **last_resort, int *last_idx, int dflt) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); struct neighbour *n; int state = NUD_NONE; if (likely(nhc->nhc_gw_family == AF_INET)) n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev); else if (nhc->nhc_gw_family == AF_INET6) n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6, nhc->nhc_dev); else n = NULL; if (n) { state = READ_ONCE(n->nud_state); neigh_release(n); } else { return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } return 1; } int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *encap, u16 encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { int err; nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, gfp_flags); if (!nhc->nhc_pcpu_rth_output) return -ENOMEM; if (encap) { struct lwtunnel_state *lwtstate; if (encap_type == LWTUNNEL_ENCAP_NONE) { NL_SET_ERR_MSG(extack, "LWT encap type not specified"); err = -EINVAL; goto lwt_failure; } err = lwtunnel_build_state(net, encap_type, encap, nhc->nhc_family, cfg, &lwtstate, extack); if (err) goto lwt_failure; nhc->nhc_lwtstate = lwtstate_get(lwtstate); } return 0; lwt_failure: rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); nhc->nhc_pcpu_rth_output = NULL; return err; } EXPORT_SYMBOL_GPL(fib_nh_common_init); int fib_nh_init(struct net *net, struct fib_nh *nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack) { int err; nh->fib_nh_family = AF_INET; err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, GFP_KERNEL, extack); if (err) return err; nh->fib_nh_oif = cfg->fc_oif; nh->fib_nh_gw_family = cfg->fc_gw_family; if (cfg->fc_gw_family == AF_INET) nh->fib_nh_gw4 = cfg->fc_gw4; else if (cfg->fc_gw_family == AF_INET6) nh->fib_nh_gw6 = cfg->fc_gw6; nh->fib_nh_flags = cfg->fc_flags; #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; #endif return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining, struct netlink_ext_ack *extack) { int nhs = 0; while (rtnh_ok(rtnh, remaining)) { nhs++; rtnh = rtnh_next(rtnh, &remaining); } /* leftover implies invalid nexthop configuration, discard it */ if (remaining > 0) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthops"); nhs = 0; } return nhs; } static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla, struct netlink_ext_ack *extack) { if (nla_len(nla) < sizeof(*gw)) { NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY"); return -EINVAL; } *gw = nla_get_in_addr(nla); return 0; } /* only called when fib_nh is integrated into fib_info */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { struct net *net = fi->fib_net; struct fib_config fib_cfg; struct fib_nh *nh; int ret; change_nexthops(fi) { int attrlen; memset(&fib_cfg, 0, sizeof(fib_cfg)); if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - extra data after nexthop"); return -EINVAL; } if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - can not contain DEAD or LINKDOWN"); return -EINVAL; } fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; fib_cfg.fc_oif = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla, extack); if (ret) goto errout; if (fib_cfg.fc_gw4) fib_cfg.fc_gw_family = AF_INET; } else if (nlav) { ret = fib_gw_from_via(&fib_cfg, nlav, extack); if (ret) goto errout; } nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } fib_cfg.fc_flow = nla_get_u32(nla); } fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); /* RTA_ENCAP_TYPE length checked in * lwtunnel_valid_encap_type_attr */ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) fib_cfg.fc_encap_type = nla_get_u16(nla); } ret = fib_nh_init(net, nexthop_nh, &fib_cfg, rtnh->rtnh_hops + 1, extack); if (ret) goto errout; rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); ret = -EINVAL; nh = fib_info_nh(fi, 0); if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) { NL_SET_ERR_MSG(extack, "Nexthop device index does not match RTA_OIF"); goto errout; } if (cfg->fc_gw_family) { if (cfg->fc_gw_family != nh->fib_nh_gw_family || (cfg->fc_gw_family == AF_INET && nh->fib_nh_gw4 != cfg->fc_gw4) || (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) { NL_SET_ERR_MSG(extack, "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA"); goto errout; } } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) { NL_SET_ERR_MSG(extack, "Nexthop class id does not match RTA_FLOW"); goto errout; } #endif ret = 0; errout: return ret; } /* only called when fib_nh is integrated into fib_info */ static void fib_rebalance(struct fib_info *fi) { int total; int w; if (fib_info_num_path(fi) < 2) return; total = 0; for_nexthops(fi) { if (nh->fib_nh_flags & RTNH_F_DEAD) continue; if (ip_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN) continue; total += nh->fib_nh_weight; } endfor_nexthops(fi); w = 0; change_nexthops(fi) { int upper_bound; if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) { upper_bound = -1; } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) && nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) { upper_bound = -1; } else { w += nexthop_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; } atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound); } endfor_nexthops(fi); } #else /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel"); return -EINVAL; } #define fib_rebalance(fi) do { } while (0) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, const struct fib_nh *nh, const struct fib_config *cfg, struct netlink_ext_ack *extack) { struct lwtunnel_state *lwtstate; int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; ret = lwtunnel_build_state(net, encap_type, encap, AF_INET, cfg, &lwtstate, extack); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws); lwtstate_free(lwtstate); } return result; } int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack) { #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; #endif if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) return 1; if (cfg->fc_nh_id) { if (fi->nh && cfg->fc_nh_id == fi->nh->id) return 0; return 1; } if (fi->nh) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp) return 1; return 0; } if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, nh, cfg, extack)) return 1; } #ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && cfg->fc_flow != nh->nh_tclassid) return 1; #endif if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) || (cfg->fc_gw_family && cfg->fc_gw_family != nh->fib_nh_gw_family)) return 1; if (cfg->fc_gw_family == AF_INET && cfg->fc_gw4 != nh->fib_nh_gw4) return 1; if (cfg->fc_gw_family == AF_INET6 && ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6)) return 1; return 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (!cfg->fc_mp) return 0; rtnh = cfg->fc_mp; remaining = cfg->fc_mp_len; for_nexthops(fi) { int attrlen; if (!rtnh_ok(rtnh, remaining)) return -EINVAL; if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif) return 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh); int err; nla = nla_find(attrs, attrlen, RTA_GATEWAY); nlav = nla_find(attrs, attrlen, RTA_VIA); if (nla && nlav) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); return -EINVAL; } if (nla) { __be32 gw; err = fib_gw_from_attr(&gw, nla, extack); if (err) return err; if (nh->fib_nh_gw_family != AF_INET || gw != nh->fib_nh_gw4) return 1; } else if (nlav) { struct fib_config cfg2; err = fib_gw_from_via(&cfg2, nlav, extack); if (err) return err; switch (nh->fib_nh_gw_family) { case AF_INET: if (cfg2.fc_gw_family != AF_INET || cfg2.fc_gw4 != nh->fib_nh_gw4) return 1; break; case AF_INET6: if (cfg2.fc_gw_family != AF_INET6 || ipv6_addr_cmp(&cfg2.fc_gw6, &nh->fib_nh_gw6)) return 1; break; } } #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla) { if (nla_len(nla) < sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW"); return -EINVAL; } if (nla_get_u32(nla) != nh->nh_tclassid) return 1; } #endif } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); #endif return 0; } bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) { struct nlattr *nla; int remaining; if (!cfg->fc_mx) return true; nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); u32 fi_val, val; if (!type) continue; if (type > RTAX_MAX) return false; type = array_index_nospec(type, RTAX_MAX + 1); if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) return false; val = nla_get_u32(nla); } fi_val = fi->fib_metrics->metrics[type - 1]; if (type == RTAX_FEATURES) fi_val &= ~DST_FEATURE_ECN_CA; if (fi_val != val) return false; } return true; } static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh, u32 table, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = table, .fc_flags = nh->fib_nh_flags | RTF_GATEWAY, .fc_ifindex = nh->fib_nh_oif, .fc_gateway = nh->fib_nh_gw6, }; struct fib6_nh fib6_nh = {}; int err; err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack); if (!err) { nh->fib_nh_dev = fib6_nh.fib_nh_dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); nh->fib_nh_oif = nh->fib_nh_dev->ifindex; nh->fib_nh_scope = RT_SCOPE_LINK; ipv6_stub->fib6_nh_release(&fib6_nh); } return err; } /* * Picture * ------- * * Semantics of nexthop is very messy by historical reasons. * We have to take into account, that: * a) gateway can be actually local interface address, * so that gatewayed route is direct. * b) gateway must be on-link address, possibly * described not by an ifaddr, but also by a direct route. * c) If both gateway and interface are specified, they should not * contradict. * d) If we use tunnel routes, gateway could be not on-link. * * Attempt to reconcile all of these (alas, self-contradictory) conditions * results in pretty ugly and hairy code with obscure logic. * * I chose to generalized it instead, so that the size * of code does not increase practically, but it becomes * much more general. * Every prefix is assigned a "scope" value: "host" is local address, * "link" is direct route, * [ ... "site" ... "interior" ... ] * and "universe" is true gateway route with global meaning. * * Every prefix refers to a set of "nexthop"s (gw, oif), * where gw must have narrower scope. This recursion stops * when gw has LOCAL scope or if "nexthop" is declared ONLINK, * which means that gw is forced to be on link. * * Code is still hairy, but now it is apparently logically * consistent and very flexible. F.e. as by-product it allows * to co-exists in peace independent exterior and interior * routing processes. * * Normally it looks as following. * * {universe prefix} -> (gw, oif) [scope link] * | * |-> {link prefix} -> (gw, oif) [scope local] * | * |-> {local prefix} (terminal node) */ static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { struct net_device *dev; struct fib_result res; int err = 0; if (nh->fib_nh_flags & RTNH_F_ONLINK) { unsigned int addr_type; if (scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Nexthop has invalid scope"); return -EINVAL; } dev = __dev_get_by_index(net, nh->fib_nh_oif); if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); return -ENODEV; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); return -ENETDOWN; } addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4); if (addr_type != RTN_UNICAST) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); return -EINVAL; } if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; nh->fib_nh_dev = dev; netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_LINK; return 0; } rcu_read_lock(); { struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->fib_nh_gw4, .flowi4_scope = scope + 1, .flowi4_oif = nh->fib_nh_oif, .flowi4_iif = LOOPBACK_IFINDEX, }; /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; if (table && table != RT_TABLE_MAIN) tbl = fib_get_table(net, table); if (tbl) err = fib_table_lookup(tbl, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE | FIB_LOOKUP_NOREF); /* on error or if no table given do full lookup. This * is needed for example when nexthops are in the local * table rather than the given table */ if (!tbl || err) { err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); } if (err) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); goto out; } nh->fib_nh_scope = res.scope; nh->fib_nh_oif = FIB_RES_OIF(res); nh->fib_nh_dev = dev = FIB_RES_DEV(res); if (!dev) { NL_SET_ERR_MSG(extack, "No egress device for nexthop gateway"); goto out; } netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); if (!netif_carrier_ok(dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; out: rcu_read_unlock(); return err; } static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, struct netlink_ext_ack *extack) { struct in_device *in_dev; int err; if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) { NL_SET_ERR_MSG(extack, "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set"); return -EINVAL; } rcu_read_lock(); err = -ENODEV; in_dev = inetdev_by_index(net, nh->fib_nh_oif); if (!in_dev) goto out; err = -ENETDOWN; if (!(in_dev->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Device for nexthop is not up"); goto out; } nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; out: rcu_read_unlock(); return err; } int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack) { int err; if (nh->fib_nh_gw_family == AF_INET) err = fib_check_nh_v4_gw(net, nh, table, scope, extack); else if (nh->fib_nh_gw_family == AF_INET6) err = fib_check_nh_v6_gw(net, nh, table, extack); else err = fib_check_nh_nongw(net, nh, extack); return err; } __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { struct fib_nh *nh; __be32 saddr; if (nhc->nhc_family != AF_INET) return inet_select_addr(nhc->nhc_dev, 0, scope); nh = container_of(nhc, struct fib_nh, nh_common); saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope); WRITE_ONCE(nh->nh_saddr, saddr); WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid)); return saddr; } __be32 fib_result_prefsrc(struct net *net, struct fib_result *res) { struct fib_nh_common *nhc = res->nhc; if (res->fi->fib_prefsrc) return res->fi->fib_prefsrc; if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (READ_ONCE(nh->nh_saddr_genid) == atomic_read(&net->ipv4.dev_addr_genid)) return READ_ONCE(nh->nh_saddr); } return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope); } static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, tb_id); if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, fib_prefsrc, RT_TABLE_LOCAL); } if (rc != RTN_LOCAL) return false; } return true; } struct fib_info *fib_create_info(struct fib_config *cfg, struct netlink_ext_ack *extack) { int err; struct fib_info *fi = NULL; struct nexthop *nh = NULL; struct fib_info *ofi; int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; ASSERT_RTNL(); if (cfg->fc_type > RTN_MAX) goto err_inval; /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) { NL_SET_ERR_MSG(extack, "Invalid rtm_flags - can not contain DEAD or LINKDOWN"); goto err_inval; } if (cfg->fc_nh_id) { if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { refcount_inc(&fi->fib_treeref); return fi; } } nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } nhs = 0; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack); if (nhs == 0) goto err_inval; } #endif fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { err = -ENOBUFS; goto failure; } fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); kfree(fi); return ERR_PTR(err); } fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; fi->fib_priority = cfg->fc_priority; fi->fib_prefsrc = cfg->fc_prefsrc; fi->fib_type = cfg->fc_type; fi->fib_tb_id = cfg->fc_table; fi->fib_nhs = nhs; if (nh) { if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -EINVAL; } else { err = 0; fi->nh = nh; } } else { change_nexthops(fi) { nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mp) err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack); else err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack); } if (err != 0) goto failure; if (fib_props[cfg->fc_type].error) { if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Gateway, device and multipath can not be specified for this route type"); goto err_inval; } goto link_it; } else { switch (cfg->fc_type) { case RTN_UNICAST: case RTN_LOCAL: case RTN_BROADCAST: case RTN_ANYCAST: case RTN_MULTICAST: break; default: NL_SET_ERR_MSG(extack, "Invalid route type"); goto err_inval; } } if (cfg->fc_scope > RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Invalid scope"); goto err_inval; } if (fi->nh) { err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack); if (err) goto failure; } else if (cfg->fc_scope == RT_SCOPE_HOST) { struct fib_nh *nh = fi->fib_nh; /* Local address is added. */ if (nhs != 1) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); goto err_inval; } if (nh->fib_nh_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); goto err_inval; } nh->fib_nh_scope = RT_SCOPE_NOWHERE; nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif); err = -ENODEV; if (!nh->fib_nh_dev) goto failure; netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL); } else { int linkdown = 0; change_nexthops(fi) { err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh, cfg->fc_table, cfg->fc_scope, extack); if (err != 0) goto failure; if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) linkdown++; } endfor_nexthops(fi) if (linkdown == fi->fib_nhs) fi->fib_flags |= RTNH_F_LINKDOWN; } if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) { NL_SET_ERR_MSG(extack, "Invalid prefsrc address"); goto err_inval; } if (!fi->nh) { change_nexthops(fi) { fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common, fi->fib_scope); if (nexthop_nh->fib_nh_gw_family == AF_INET6) fi->fib_nh_is_v6 = true; } endfor_nexthops(fi) fib_rebalance(fi); } link_it: ofi = fib_find_info(fi); if (ofi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); refcount_inc(&ofi->fib_treeref); return ofi; } refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { struct hlist_head *head; head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc); hlist_add_head(&fi->fib_lhash, head); } if (fi->nh) { list_add(&fi->nh_list, &nh->fi_list); } else { change_nexthops(fi) { struct hlist_head *head; if (!nexthop_nh->fib_nh_dev) continue; head = fib_nh_head(nexthop_nh->fib_nh_dev); hlist_add_head_rcu(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) } return fi; err_inval: err = -EINVAL; failure: if (fi) { /* fib_table_lookup() should not see @fi yet. */ fi->fib_dead = 1; free_fib_info(fi); } return ERR_PTR(err); } int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc, u8 rt_family, unsigned char *flags, bool skip_oif) { if (nhc->nhc_flags & RTNH_F_DEAD) *flags |= RTNH_F_DEAD; if (nhc->nhc_flags & RTNH_F_LINKDOWN) { *flags |= RTNH_F_LINKDOWN; rcu_read_lock(); switch (nhc->nhc_family) { case AF_INET: if (ip_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; case AF_INET6: if (ip6_ignore_linkdown(nhc->nhc_dev)) *flags |= RTNH_F_DEAD; break; } rcu_read_unlock(); } switch (nhc->nhc_gw_family) { case AF_INET: if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4)) goto nla_put_failure; break; case AF_INET6: /* if gateway family does not match nexthop family * gateway is encoded as RTA_VIA */ if (rt_family != nhc->nhc_gw_family) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen); } else if (nla_put_in6_addr(skb, RTA_GATEWAY, &nhc->nhc_gw.ipv6) < 0) { goto nla_put_failure; } break; } *flags |= (nhc->nhc_flags & (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP)); if (!skip_oif && nhc->nhc_dev && nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex)) goto nla_put_failure; if (nhc->nhc_lwtstate && lwtunnel_fill_encap(skb, nhc->nhc_lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; unsigned char flags = 0; rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; rtnh->rtnh_hops = nh_weight - 1; rtnh->rtnh_ifindex = dev ? dev->ifindex : 0; if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0) goto nla_put_failure; rtnh->rtnh_flags = flags; if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) goto nla_put_failure; /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; return 0; nla_put_failure: return -EMSGSIZE; } EXPORT_SYMBOL_GPL(fib_add_nexthop); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (unlikely(fi->nh)) { if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0) goto nla_put_failure; goto mp_end; } for_nexthops(fi) { u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID nh_tclassid = nh->nh_tclassid; #endif if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, AF_INET, nh_tclassid) < 0) goto nla_put_failure; } endfor_nexthops(fi); mp_end: nla_nest_end(skb, mp); return 0; nla_put_failure: return -EMSGSIZE; } #else static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) { return 0; } #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, const struct fib_rt_info *fri, unsigned int flags) { unsigned int nhs = fib_info_num_path(fri->fi); struct fib_info *fi = fri->fi; u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp); if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->nh) { if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode)) goto offload; } if (nhs == 1) { const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); unsigned char flags = 0; if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0) goto nla_put_failure; rtm->rtm_flags = flags; #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); if (nh->nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; } #endif } else { if (fib_add_multipath(skb, fi) < 0) goto nla_put_failure; } offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) rtm->rtm_flags |= RTM_F_TRAP; if (fri->offload_failed) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } /* * Update FIB if: * - local address disappeared -> we must delete all the entries * referring to it. * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net_device *dev, __be32 local) { int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; struct net *net = dev_net(dev); struct hlist_head *head; struct fib_info *fi; int ret = 0; if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); hlist_for_each_entry(fi, head, fib_lhash) { if (!net_eq(fi->fib_net, net) || fi->fib_tb_id != tb_id) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; fi->pfsrc_removed = true; ret++; } } return ret; } static int call_fib_nh_notifiers(struct fib_nh *nh, enum fib_event_type event_type) { bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev); struct fib_nh_notifier_info info = { .fib_nh = nh, }; switch (event_type) { case FIB_EVENT_NH_ADD: if (nh->fib_nh_flags & RTNH_F_DEAD) break; if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) break; return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); case FIB_EVENT_NH_DEL: if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) || (nh->fib_nh_flags & RTNH_F_DEAD)) return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type, &info.info); break; default: break; } return NOTIFY_DONE; } /* Update the PMTU of exceptions when: * - the new MTU of the first hop becomes smaller than the PMTU * - the old MTU was the same as the PMTU, and it limited discovery of * larger MTUs on the path. With that limit raised, we can now * discover larger MTUs * A special case is locked exceptions, for which the PMTU is smaller * than the minimal accepted PMTU: * - if the new MTU is greater than the PMTU, don't make any change * - otherwise, unlock and set PMTU */ void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig) { struct fnhe_hash_bucket *bucket; int i; bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1); if (!bucket) return; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); fnhe; fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { if (fnhe->fnhe_mtu_locked) { if (new <= fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; fnhe->fnhe_mtu_locked = false; } } else if (new < fnhe->fnhe_pmtu || orig == fnhe->fnhe_pmtu) { fnhe->fnhe_pmtu = new; } } } } void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) { struct hlist_head *head = fib_nh_head(dev); struct fib_nh *nh; hlist_for_each_entry(nh, head, nh_hash) { DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu); } } /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed * * only used when fib_nh is built into fib_info */ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { struct hlist_head *head = fib_nh_head(dev); struct fib_info *prev_fi = NULL; int scope = RT_SCOPE_NOWHERE; struct fib_nh *nh; int ret = 0; if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int dead; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; dead = 0; change_nexthops(fi) { if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->fib_nh_dev == dev && nexthop_nh->fib_nh_scope != scope) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_nh->fib_nh_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN; break; } call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_DEL); dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (event == NETDEV_UNREGISTER && nexthop_nh->fib_nh_dev == dev) { dead = fi->fib_nhs; break; } #endif } endfor_nexthops(fi) if (dead == fi->fib_nhs) { switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: fi->fib_flags |= RTNH_F_DEAD; fallthrough; case NETDEV_CHANGE: fi->fib_flags |= RTNH_F_LINKDOWN; break; } ret++; } fib_rebalance(fi); } return ret; } /* Must be invoked inside of an RCU protected region. */ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; u8 slen = 32 - res->prefixlen; int order = -1, last_idx = -1; struct fib_alias *fa, *fa1 = NULL; u32 last_prio = res->fi->fib_priority; dscp_t last_dscp = 0; hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; struct fib_nh_common *nhc; if (fa->fa_slen != slen) continue; if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; if (fa->tb_id != tb->tb_id) continue; if (next_fi->fib_priority > last_prio && fa->fa_dscp == last_dscp) { if (last_dscp) continue; break; } if (next_fi->fib_flags & RTNH_F_DEAD) continue; last_dscp = fa->fa_dscp; last_prio = next_fi->fib_priority; if (next_fi->fib_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; nhc = fib_info_nhc(next_fi, 0); if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK) continue; fib_alias_accessed(fa); if (!fi) { if (next_fi != res->fi) break; fa1 = fa; } else if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } fi = next_fi; order++; } if (order <= 0 || !fi) { if (fa1) fa1->fa_default = -1; goto out; } if (!fib_detect_death(fi, order, &last_resort, &last_idx, fa1->fa_default)) { fib_result_assign(res, fi); fa1->fa_default = order; goto out; } if (last_idx >= 0) fib_result_assign(res, last_resort); fa1->fa_default = last_idx; out: return; } /* * Dead device goes up. We wake up dead nexthops. * It takes sense only on multipath routes. * * only used when fib_nh is built into fib_info */ int fib_sync_up(struct net_device *dev, unsigned char nh_flags) { struct fib_info *prev_fi; struct hlist_head *head; struct fib_nh *nh; int ret; if (!(dev->flags & IFF_UP)) return 0; if (nh_flags & RTNH_F_DEAD) { unsigned int flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) nh_flags |= RTNH_F_LINKDOWN; } prev_fi = NULL; head = fib_nh_head(dev); ret = 0; hlist_for_each_entry(nh, head, nh_hash) { struct fib_info *fi = nh->nh_parent; int alive; BUG_ON(!fi->fib_nhs); DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev); if (fi == prev_fi) continue; prev_fi = fi; alive = 0; change_nexthops(fi) { if (!(nexthop_nh->fib_nh_flags & nh_flags)) { alive++; continue; } if (!nexthop_nh->fib_nh_dev || !(nexthop_nh->fib_nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->fib_nh_dev != dev || !__in_dev_get_rtnl(dev)) continue; alive++; nexthop_nh->fib_nh_flags &= ~nh_flags; call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD); } endfor_nexthops(fi) if (alive > 0) { fi->fib_flags &= ~nh_flags; ret++; } fib_rebalance(fi); } return ret; } #ifdef CONFIG_IP_ROUTE_MULTIPATH static bool fib_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; if (nh->fib_nh_scope == RT_SCOPE_LINK) { struct neighbour *n; rcu_read_lock(); if (likely(nh->fib_nh_gw_family == AF_INET)) n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); else if (nh->fib_nh_gw_family == AF_INET6) n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); else n = NULL; if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); } return !!(state & NUD_VALID); } void fib_select_multipath(struct fib_result *res, int hash, const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; bool found = false; bool use_neigh; __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); saddr = fl4 ? fl4->saddr : 0; change_nexthops(fi) { int nh_upper_bound; /* Nexthops without a carrier are assigned an upper bound of * minus one when "ignore_routes_with_linkdown" is set. */ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); if (nh_upper_bound == -1 || (use_neigh && !fib_good_nh(nexthop_nh))) continue; if (!found) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > nh_upper_bound) continue; if (!saddr || nexthop_nh->nh_saddr == saddr) { res->nh_sel = nhsel; res->nhc = &nexthop_nh->nh_common; return; } if (found) return; } endfor_nexthops(fi); } #endif void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb) { if (fl4->flowi4_oif) goto check_saddr; #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); fib_select_multipath(res, h, fl4); } else #endif if (!res->prefixlen && res->table->tb_num_default > 1 && res->type == RTN_UNICAST) fib_select_default(fl4, res); check_saddr: if (!fl4->saddr) { struct net_device *l3mdev; l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev); if (!l3mdev || l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev) fl4->saddr = fib_result_prefsrc(net, res); else fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); if (!net->ipv4.fib_info_hash) return -ENOMEM; net->ipv4.fib_info_hash_bits = hash_bits; net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { fib_info_hash_free(net->ipv4.fib_info_hash); }
805 806 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 // SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (ops->pf != NFPROTO_NETDEV || ops->hooknum != NF_NETDEV_INGRESS) return false; dev = ops->dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (this_dev && this_dev != ops->dev) continue; err = nft_chain_offload_cmd(basechain, ops->dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (i-- <= 0) break; nft_chain_offload_cmd(basechain, ops->dev, FLOW_BLOCK_UNBIND); } } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->flags & NLM_F_REPLACE || !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (!nft_hook_find_ops(hook, dev)) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); }
99 98 99 54 54 84 84 84 94 94 49 49 48 54 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 // SPDX-License-Identifier: GPL-2.0 /* * Implementation of the diskquota system for the LINUX operating system. QUOTA * is implemented using the BSD system call interface as the means of * communication with the user level. This file contains the generic routines * called by the different filesystems on allocation of an inode or block. * These routines take care of the administration needed to have a consistent * diskquota tracking system. The ideas of both user and group quotas are based * on the Melbourne quota system as used on BSD derived systems. The internal * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 * * Revised list management to avoid races * -- Bill Hawes, <whawes@star.net>, 9/98 * * Fixed races in dquot_transfer(), dqget() and dquot_alloc_...(). * As the consequence the locking was moved from dquot_decr_...(), * dquot_incr_...() to calling functions. * invalidate_dquots() now writes modified dquots. * Serialized quota_off() and quota_on() for mount point. * Fixed a few bugs in grow_dquots(). * Fixed deadlock in write_dquot() - we no longer account quotas on * quota files * remove_dquot_ref() moved to inode.c - it now traverses through inodes * add_dquot_ref() restarts after blocking * Added check for bogus uid and fixed check for group in quotactl. * Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99 * * Used struct list_head instead of own list struct * Invalidation of referenced dquots is no longer possible * Improved free_dquots list management * Quota and i_blocks are now updated in one place to avoid races * Warnings are now delayed so we won't block in critical section * Write updated not to require dquot lock * Jan Kara, <jack@suse.cz>, 9/2000 * * Added dynamic quota structure allocation * Jan Kara <jack@suse.cz> 12/2000 * * Rewritten quota interface. Implemented new quota format and * formats registering. * Jan Kara, <jack@suse.cz>, 2001,2002 * * New SMP locking. * Jan Kara, <jack@suse.cz>, 10/2002 * * Added journalled quota support, fix lock inversion problems * Jan Kara, <jack@suse.cz>, 2003,2004 * * (C) Copyright 1994 - 1997 Marco van Wieringen */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/mm.h> #include <linux/time.h> #include <linux/types.h> #include <linux/string.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/tty.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/sched.h> #include <linux/cred.h> #include <linux/kmod.h> #include <linux/namei.h> #include <linux/capability.h> #include <linux/quotaops.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/uaccess.h> /* * There are five quota SMP locks: * * dq_list_lock protects all lists with quotas and quota formats. * * dquot->dq_dqb_lock protects data from dq_dqb * * inode->i_lock protects inode->i_blocks, i_bytes and also guards * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that * dquot_transfer() can stabilize amount it transfers * * dq_data_lock protects mem_dqinfo structures and modifications of dquot * pointers in the inode * * dq_state_lock protects modifications of quota state (on quotaon and * quotaoff) and readers who care about latest values take it as well. * * The spinlock ordering is hence: * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock * * Operation accessing dquots via inode pointers are protected by dquot_srcu. * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and * synchronize_srcu(&dquot_srcu) is called after clearing pointers from * inode and before dropping dquot references to avoid use of dquots after * they are freed. dq_data_lock is used to serialize the pointer setting and * clearing operations. * Special care needs to be taken about S_NOQUOTA inode flag (marking that * inode is a quota file). Functions adding pointers from inode to dquots have * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they * have to do all pointer modifications before dropping dq_data_lock. This makes * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * * Each dquot has its dq_lock mutex. Dquot is locked when it is being read to * memory (or space for it is being allocated) on the first dqget(), when it is * being written out, and when it is being released on the last dqput(). The * allocation and release operations are serialized by the dq_lock and by * checking the use count in dquot_release(). * * Lock ordering (including related VFS locks) is the following: * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) { if (printk_ratelimit()) { va_list args; struct va_format vaf; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_ERR "Quota error (device %s): %s: %pV\n", sb->s_id, func, &vaf); va_end(args); } } EXPORT_SYMBOL(__quota_error); #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif static struct quota_format_type *quota_formats; /* List of registered formats */ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; /* SLAB cache for dquot structures */ static struct kmem_cache *dquot_cachep; void register_quota_format(struct quota_format_type *fmt) { spin_lock(&dq_list_lock); fmt->qf_next = quota_formats; quota_formats = fmt; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(register_quota_format); void unregister_quota_format(struct quota_format_type *fmt) { struct quota_format_type **actqf; spin_lock(&dq_list_lock); for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next) ; if (*actqf) *actqf = (*actqf)->qf_next; spin_unlock(&dq_list_lock); } EXPORT_SYMBOL(unregister_quota_format); static struct quota_format_type *find_quota_format(int id) { struct quota_format_type *actqf; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (!actqf || !try_module_get(actqf->qf_owner)) { int qm; spin_unlock(&dq_list_lock); for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++) ; if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name)) return NULL; spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next) ; if (actqf && !try_module_get(actqf->qf_owner)) actqf = NULL; } spin_unlock(&dq_list_lock); return actqf; } static void put_quota_format(struct quota_format_type *fmt) { module_put(fmt->qf_owner); } /* * Dquot List Management: * The quota code uses five lists for dquot management: the inuse_list, * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array. * A single dquot structure may be on some of those lists, depending on * its current state. * * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * * When the last reference of a dquot is dropped, the dquot is added to * releasing_dquots. We'll then queue work item which will call * synchronize_srcu() and after that perform the final cleanup of all the * dquots on the list. Each cleaned up dquot is moved to free_dquots list. * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot * struct. * * Unused and cleaned up dquots are in the free_dquots list and this list is * searched whenever we need an available dquot. Dquots are removed from the * list as soon as they are used again and dqstats.free_dquots gives the number * of dquots on the list. When dquot is invalidated it's completely released * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when mark * dirtied, and this list is searched when writing dirty dquots back to * quota file. Note that some filesystems do dirty dquot tracking on their * own (e.g. in a journal) and thus don't use dqi_dirty_list. * * Dquots with a specific identity (device, type and id) are placed on * one of the dquot_hash[] hash chains. The provides an efficient search * mechanism to locate a specific dquot. */ static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static LIST_HEAD(releasing_dquots); static unsigned int dq_hash_bits, dq_hash_mask; static struct hlist_head *dquot_hash; struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static void quota_release_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn); static inline unsigned int hashfn(const struct super_block *sb, struct kqid qid) { unsigned int id = from_kqid(&init_user_ns, qid); int type = qid.type; unsigned long tmp; tmp = (((unsigned long)sb>>L1_CACHE_SHIFT) ^ id) * (MAXQUOTAS - type); return (tmp + (tmp >> dq_hash_bits)) & dq_hash_mask; } /* * Following list functions expect dq_list_lock to be held */ static inline void insert_dquot_hash(struct dquot *dquot) { struct hlist_head *head; head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id); hlist_add_head(&dquot->dq_hash, head); } static inline void remove_dquot_hash(struct dquot *dquot) { hlist_del_init(&dquot->dq_hash); } static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, struct kqid qid) { struct dquot *dquot; hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash) if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid)) return dquot; return NULL; } /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &free_dquots); dqstats_inc(DQST_FREE_DQUOTS); } static inline void put_releasing_dquots(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &releasing_dquots); set_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void remove_free_dquot(struct dquot *dquot) { if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) dqstats_dec(DQST_FREE_DQUOTS); else clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats_inc(DQST_ALLOC_DQUOTS); } static inline void remove_inuse(struct dquot *dquot) { dqstats_dec(DQST_ALLOC_DQUOTS); list_del(&dquot->dq_inuse); } /* * End of list functions needing dq_list_lock */ static void wait_on_dquot(struct dquot *dquot) { mutex_lock(&dquot->dq_lock); mutex_unlock(&dquot->dq_lock); } static inline int dquot_active(struct dquot *dquot) { return test_bit(DQ_ACTIVE_B, &dquot->dq_flags); } static inline int dquot_dirty(struct dquot *dquot) { return test_bit(DQ_MOD_B, &dquot->dq_flags); } static inline int mark_dquot_dirty(struct dquot *dquot) { return dquot->dq_sb->dq_op->mark_dirty(dquot); } /* Mark dquot dirty in atomic manner, and return it's old dirty flag state */ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; if (!dquot_active(dquot)) return 0; if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (dquot_dirty(dquot)) return 1; spin_lock(&dq_list_lock); if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) { list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)-> info[dquot->dq_id.type].dqi_dirty_list); ret = 0; } spin_unlock(&dq_list_lock); return ret; } EXPORT_SYMBOL(dquot_mark_dquot_dirty); /* Dirtify all the dquots - this can block when journalling */ static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots) { int ret, err, cnt; struct dquot *dquot; ret = err = 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) /* Even in case of error we have to continue */ ret = mark_dquot_dirty(dquot); if (!err && ret < 0) err = ret; } return err; } static inline void dqput_all(struct dquot **dquot) { unsigned int cnt; for (cnt = 0; cnt < MAXQUOTAS; cnt++) dqput(dquot[cnt]); } static inline int clear_dquot_dirty(struct dquot *dquot) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); spin_lock(&dq_list_lock); if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); return 0; } list_del_init(&dquot->dq_dirty); spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { spin_lock(&dq_data_lock); sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); /* * Read dquot from disk and alloc space for it */ int dquot_acquire(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) { ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) goto out_iolock; } /* Make sure flags update is visible after dquot has been filled */ smp_mb__before_atomic(); set_bit(DQ_READ_B, &dquot->dq_flags); /* Instantiate dquot if needed */ if (!dquot_active(dquot) && !dquot->dq_off) { ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); /* Write the info if needed */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret < 0) goto out_iolock; if (ret2 < 0) { ret = ret2; goto out_iolock; } } /* * Make sure flags update is visible after on-disk struct has been * allocated. Paired with smp_rmb() in dqget(). */ smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_acquire); /* * Write dquot to disk */ int dquot_commit(struct dquot *dquot) { int ret = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); if (!clear_dquot_dirty(dquot)) goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (dquot_active(dquot)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; out_lock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_commit); /* * Release dquot */ int dquot_release(struct dquot *dquot) { int ret = 0, ret2 = 0; unsigned int memalloc; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); memalloc = memalloc_nofs_save(); /* Check whether we are not racing with some other dqget() */ if (dquot_is_busy(dquot)) goto out_dqlock; if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ if (info_dirty(&dqopt->info[dquot->dq_id.type])) { ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info( dquot->dq_sb, dquot->dq_id.type); } if (ret >= 0) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_dqlock: memalloc_nofs_restore(memalloc); mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_release); void dquot_destroy(struct dquot *dquot) { kmem_cache_free(dquot_cachep, dquot); } EXPORT_SYMBOL(dquot_destroy); static inline void do_destroy_dquot(struct dquot *dquot) { dquot->dq_sb->dq_op->destroy_dquot(dquot); } /* Invalidate all dquots on the list. Note that this function is called after * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being * just deleted or pruned by prune_icache() (those are not attached to any * list) or parallel quotactl call. We have to wait for such users. */ static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot, *tmp; restart: flush_delayed_work(&quota_release_work); spin_lock(&dq_list_lock); list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) continue; if (dquot->dq_id.type != type) continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ wait_event(dquot_ref_wq, atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must * restart. */ goto restart; } /* * The last user already dropped its reference but dquot didn't * get fully cleaned up yet. Restart the scan which flushes the * work cleaning up released dquots. */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); goto restart; } /* * Quota now has no users and it has been written on last * dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); } spin_unlock(&dq_list_lock); } /* Call callback for every active dquot on given filesystem */ int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv) { struct dquot *dquot, *old_dquot = NULL; int ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!dquot_active(dquot)) continue; if (dquot->dq_sb != sb) continue; /* Now we have active dquot so we can just increase use count */ atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqput(old_dquot); old_dquot = dquot; /* * ->release_dquot() can be racing with us. Our reference * protects us from new calls to it so just wait for any * outstanding call and recheck the DQ_ACTIVE_B after that. */ wait_on_dquot(dquot); if (dquot_active(dquot)) { ret = fn(dquot, priv); if (ret < 0) goto out; } spin_lock(&dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); out: dqput(old_dquot); return ret; } EXPORT_SYMBOL(dquot_scan_active); static inline int dquot_write_dquot(struct dquot *dquot) { int ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { quota_error(dquot->dq_sb, "Can't write quota structure " "(error %d). Quota may get out of sync!", ret); /* Clear dirty bit anyway to avoid infinite loop. */ clear_dquot_dirty(dquot); } return ret; } /* Write all dquot structures to quota files */ int dquot_writeback_dquots(struct super_block *sb, int type) { struct list_head dirty; struct dquot *dquot; struct quota_info *dqopt = sb_dqopt(sb); int cnt; int err, ret = 0; WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); flush_delayed_work(&quota_release_work); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); /* Move list away to avoid livelock. */ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty); while (!list_empty(&dirty)) { dquot = list_first_entry(&dirty, struct dquot, dq_dirty); WARN_ON(!dquot_active(dquot)); /* If the dquot is releasing we should not touch it */ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); flush_delayed_work(&quota_release_work); spin_lock(&dq_list_lock); continue; } /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ dqgrab(dquot); spin_unlock(&dq_list_lock); err = dquot_write_dquot(dquot); if (err && !ret) ret = err; dqput(dquot); spin_lock(&dq_list_lock); } spin_unlock(&dq_list_lock); } for (cnt = 0; cnt < MAXQUOTAS; cnt++) if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); return ret; } EXPORT_SYMBOL(dquot_writeback_dquots); /* Write all dquot structures to disk and make them visible from userspace */ int dquot_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int cnt; int ret; ret = dquot_writeback_dquots(sb, type); if (ret) return ret; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) return 0; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ if (sb->s_op->sync_fs) { ret = sb->s_op->sync_fs(sb, 1); if (ret) return ret; } ret = sync_blockdev(sb->s_bdev); if (ret) return ret; /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_active(sb, cnt)) continue; inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } return 0; } EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); while (!list_empty(&free_dquots) && sc->nr_to_scan) { dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; } spin_unlock(&dq_list_lock); return freed; } static unsigned long dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio( percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } /* * Safely release dquot and put reference to dquot. */ static void quota_release_workfn(struct work_struct *work) { struct dquot *dquot; struct list_head rls_head; spin_lock(&dq_list_lock); /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); synchronize_srcu(&dquot_srcu); restart: spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); WARN_ON_ONCE(atomic_read(&dquot->dq_count)); /* * Note that DQ_RELEASING_B protects us from racing with * invalidate_dquots() calls so we are safe to work with the * dquot even after we drop dq_list_lock. */ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ dquot_write_dquot(dquot); goto restart; } if (dquot_active(dquot)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); goto restart; } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); } /* * Put reference to dquot */ void dqput(struct dquot *dquot) { if (!dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", quotatypes[dquot->dq_id.type], from_kqid(&init_user_ns, dquot->dq_id)); BUG(); } #endif dqstats_inc(DQST_DROPS); spin_lock(&dq_list_lock); if (atomic_read(&dquot->dq_count) > 1) { /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? */ WARN_ON_ONCE(!list_empty(&dquot->dq_free)); put_releasing_dquots(dquot); atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(system_unbound_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); struct dquot *dquot_alloc(struct super_block *sb, int type) { return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); } EXPORT_SYMBOL(dquot_alloc); static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NULL; mutex_init(&dquot->dq_lock); INIT_LIST_HEAD(&dquot->dq_free); INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); spin_lock_init(&dquot->dq_dqb_lock); return dquot; } /* * Get reference to dquot * * Locking is slightly tricky here. We are guarded from parallel quotaoff() * destroying our dquot by: * a) checking for quota flags under dq_list_lock and * b) getting a reference to dquot before we release dq_list_lock */ struct dquot *dqget(struct super_block *sb, struct kqid qid) { unsigned int hashent = hashfn(sb, qid); struct dquot *dquot, *empty = NULL; if (!qid_has_mapping(sb->s_user_ns, qid)) return ERR_PTR(-EINVAL); if (!sb_has_quota_active(sb, qid.type)) return ERR_PTR(-ESRCH); we_slept: spin_lock(&dq_list_lock); spin_lock(&dq_state_lock); if (!sb_has_quota_active(sb, qid.type)) { spin_unlock(&dq_state_lock); spin_unlock(&dq_list_lock); dquot = ERR_PTR(-ESRCH); goto out; } spin_unlock(&dq_state_lock); dquot = find_dquot(hashent, sb, qid); if (!dquot) { if (!empty) { spin_unlock(&dq_list_lock); empty = get_empty_dquot(sb, qid.type); if (!empty) schedule(); /* Try to wait for a moment... */ goto we_slept; } dquot = empty; empty = NULL; dquot->dq_id = qid; /* all dquots go on the inuse_list */ put_inuse(dquot); /* hash it first so it can be found */ insert_dquot_hash(dquot); spin_unlock(&dq_list_lock); dqstats_inc(DQST_LOOKUPS); } else { if (!atomic_read(&dquot->dq_count)) remove_free_dquot(dquot); atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); dqstats_inc(DQST_CACHE_HITS); dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { int err; err = sb->dq_op->acquire_dquot(dquot); if (err < 0) { dqput(dquot); dquot = ERR_PTR(err); goto out; } } /* * Make sure following reads see filled structure - paired with * smp_mb__before_atomic() in dquot_acquire(). */ smp_rmb(); /* Has somebody invalidated entry under us? */ WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash)); out: if (empty) do_destroy_dquot(empty); return dquot; } EXPORT_SYMBOL(dqget); static inline struct dquot __rcu **i_dquot(struct inode *inode) { return inode->i_sb->s_op->get_dquots(inode); } static int dqinit_needed(struct inode *inode, int type) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return 0; dquots = i_dquot(inode); if (type != -1) return !dquots[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!dquots[cnt]) return 1; return 0; } /* This routine is guarded by s_umount semaphore */ static int add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || !dqinit_needed(inode, type)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif iput(old_inode); err = __dquot_initialize(inode, type); if (err) { iput(inode); goto out; } /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock. We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { quota_error(sb, "Writes happened before quota was turned on " "thus quota information is probably inconsistent. " "Please run quotacheck(8)"); } #endif return err; } static void remove_dquot_ref(struct super_block *sb, int type) { struct inode *inode; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch * only quota pointers and these have separate locking * (dq_data_lock). */ spin_lock(&dq_data_lock); if (!IS_NOQUOTA(inode)) { struct dquot __rcu **dquots = i_dquot(inode); struct dquot *dquot = srcu_dereference_check( dquots[type], &dquot_srcu, lockdep_is_held(&dq_data_lock)); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif rcu_assign_pointer(dquots[type], NULL); if (dquot) dqput(dquot); } spin_unlock(&dq_data_lock); } spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" " was disabled thus quota information is probably " "inconsistent. Please run quotacheck(8).\n", sb->s_id); } #endif } /* Gather all references from inodes and drop them */ static void drop_dquot_ref(struct super_block *sb, int type) { if (sb->dq_op) remove_dquot_ref(sb, type); } static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { if (dquot->dq_dqb.dqb_rsvspace >= number) dquot->dq_dqb.dqb_rsvspace -= number; else { WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) dquot->dq_dqb.dqb_itime = (time64_t) 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } static void dquot_decr_space(struct dquot *dquot, qsize_t number) { if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } struct dquot_warn { struct super_block *w_sb; struct kqid w_dq_id; short w_type; }; static int warning_issued(struct dquot *dquot, const int warntype) { int flag = (warntype == QUOTA_NL_BHARDWARN || warntype == QUOTA_NL_BSOFTLONGWARN) ? DQ_BLKS_B : ((warntype == QUOTA_NL_IHARDWARN || warntype == QUOTA_NL_ISOFTLONGWARN) ? DQ_INODES_B : 0); if (!flag) return 0; return test_and_set_bit(flag, &dquot->dq_flags); } #ifdef CONFIG_PRINT_QUOTA_WARNING static int flag_print_warnings = 1; static int need_print_warning(struct dquot_warn *warn) { if (!flag_print_warnings) return 0; switch (warn->w_dq_id.type) { case USRQUOTA: return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); case PRJQUOTA: return 1; } return 0; } /* Print warning to user which exceeded quota */ static void print_warning(struct dquot_warn *warn) { char *msg = NULL; struct tty_struct *tty; int warntype = warn->w_type; if (warntype == QUOTA_NL_IHARDBELOW || warntype == QUOTA_NL_ISOFTBELOW || warntype == QUOTA_NL_BHARDBELOW || warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(warn)) return; tty = get_current_tty(); if (!tty) return; tty_write_message(tty, warn->w_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); else tty_write_message(tty, ": write failed, "); tty_write_message(tty, quotatypes[warn->w_dq_id.type]); switch (warntype) { case QUOTA_NL_IHARDWARN: msg = " file limit reached.\r\n"; break; case QUOTA_NL_ISOFTLONGWARN: msg = " file quota exceeded too long.\r\n"; break; case QUOTA_NL_ISOFTWARN: msg = " file quota exceeded.\r\n"; break; case QUOTA_NL_BHARDWARN: msg = " block limit reached.\r\n"; break; case QUOTA_NL_BSOFTLONGWARN: msg = " block quota exceeded too long.\r\n"; break; case QUOTA_NL_BSOFTWARN: msg = " block quota exceeded.\r\n"; break; } tty_write_message(tty, msg); tty_kref_put(tty); } #endif static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot, int warntype) { if (warning_issued(dquot, warntype)) return; warn->w_type = warntype; warn->w_sb = dquot->dq_sb; warn->w_dq_id = dquot->dq_id; } /* * Write warnings to the console and send warning messages over netlink. * * Note that this function can call into tty and networking code. */ static void flush_warnings(struct dquot_warn *warn) { int i; for (i = 0; i < MAXQUOTAS; i++) { if (warn[i].w_type == QUOTA_NL_NOWARN) continue; #ifdef CONFIG_PRINT_QUOTA_WARNING print_warning(&warn[i]); #endif quota_send_warning(warn[i].w_dq_id, warn[i].w_sb->s_dev, warn[i].w_type); } } static int ignore_hardlimit(struct dquot *dquot) { struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & DQF_ROOT_SQUASH)); } static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, struct dquot_warn *warn) { qsize_t newinodes; int ret = 0; spin_lock(&dquot->dq_dqb_lock); newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); ret = -EDQUOT; goto out; } if (dquot->dq_dqb.dqb_isoftlimit && newinodes > dquot->dq_dqb.dqb_isoftlimit && dquot->dq_dqb.dqb_itime == 0) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN); dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } add: dquot->dq_dqb.dqb_curinodes = newinodes; out: spin_unlock(&dquot->dq_dqb_lock); return ret; } static int dquot_add_space(struct dquot *dquot, qsize_t space, qsize_t rsv_space, unsigned int flags, struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; int ret = 0; spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) goto finish; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace + space + rsv_space; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); ret = -EDQUOT; goto finish; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ ret = -EDQUOT; goto finish; } } finish: /* * We have to be careful and go through warning generation & grace time * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it * only here... */ if (flags & DQUOT_SPACE_NOFAIL) ret = 0; if (!ret) { dquot->dq_dqb.dqb_rsvspace += rsv_space; dquot->dq_dqb.dqb_curspace += space; } spin_unlock(&dquot->dq_dqb_lock); return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) { qsize_t newinodes; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type)) return QUOTA_NL_NOWARN; newinodes = dquot->dq_dqb.dqb_curinodes - inodes; if (newinodes <= dquot->dq_dqb.dqb_isoftlimit) return QUOTA_NL_ISOFTBELOW; if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit && newinodes < dquot->dq_dqb.dqb_ihardlimit) return QUOTA_NL_IHARDBELOW; return QUOTA_NL_NOWARN; } static int info_bdq_free(struct dquot *dquot, qsize_t space) { qsize_t tspace; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; if (tspace >= dquot->dq_dqb.dqb_bhardlimit && tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } static int inode_quota_active(const struct inode *inode) { struct super_block *sb = inode->i_sb; if (IS_NOQUOTA(inode)) return 0; return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); } /* * Initialize quota pointers in inode * * It is better to call this function outside of any transaction as it * might need a lot of space in journal for dquot structure allocation. */ static int __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; struct dquot __rcu **dquots; struct dquot *got[MAXQUOTAS] = {}; struct super_block *sb = inode->i_sb; qsize_t rsv; int ret = 0; if (!inode_quota_active(inode)) return 0; dquots = i_dquot(inode); /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; kprojid_t projid; int rc; struct dquot *dquot; if (type != -1 && cnt != type) continue; /* * The i_dquot should have been initialized in most cases, * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. */ if (dquots[cnt]) continue; if (!sb_has_quota_active(sb, cnt)) continue; init_needed = 1; switch (cnt) { case USRQUOTA: qid = make_kqid_uid(inode->i_uid); break; case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; case PRJQUOTA: rc = inode->i_sb->dq_op->get_projid(inode, &projid); if (rc) continue; qid = make_kqid_projid(projid); break; } dquot = dqget(sb, qid); if (IS_ERR(dquot)) { /* We raced with somebody turning quotas off... */ if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } got[cnt] = dquot; } /* All required i_dquot has been initialized */ if (!init_needed) return 0; spin_lock(&dq_data_lock); if (IS_NOQUOTA(inode)) goto out_lock; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(sb, cnt)) continue; /* We could race with quotaon or dqget() could have failed */ if (!got[cnt]) continue; if (!dquots[cnt]) { rcu_assign_pointer(dquots[cnt], got[cnt]); got[cnt] = NULL; /* * Make quota reservation system happy if someone * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) { struct dquot *dquot = srcu_dereference_check( dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); spin_lock(&inode->i_lock); /* Get reservation again under proper lock */ rsv = __inode_get_rsv_space(inode); spin_lock(&dquot->dq_dqb_lock); dquot->dq_dqb.dqb_rsvspace += rsv; spin_unlock(&dquot->dq_dqb_lock); spin_unlock(&inode->i_lock); } } } out_lock: spin_unlock(&dq_data_lock); out_put: /* Drop unused references */ dqput_all(got); return ret; } int dquot_initialize(struct inode *inode) { return __dquot_initialize(inode, -1); } EXPORT_SYMBOL(dquot_initialize); bool dquot_initialize_needed(struct inode *inode) { struct dquot __rcu **dquots; int i; if (!inode_quota_active(inode)) return false; dquots = i_dquot(inode); for (i = 0; i < MAXQUOTAS; i++) if (!dquots[i] && sb_has_quota_active(inode->i_sb, i)) return true; return false; } EXPORT_SYMBOL(dquot_initialize_needed); /* * Release all quotas referenced by inode. * * This function only be called on inode free or converting * a file to quota file, no other users for the i_dquot in * both cases, so we needn't call synchronize_srcu() after * clearing i_dquot. */ static void __dquot_drop(struct inode *inode) { int cnt; struct dquot __rcu **dquots = i_dquot(inode); struct dquot *put[MAXQUOTAS]; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); rcu_assign_pointer(dquots[cnt], NULL); } spin_unlock(&dq_data_lock); dqput_all(put); } void dquot_drop(struct inode *inode) { struct dquot __rcu * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return; /* * Test before calling to rule out calls from proc and such * where we are not allowed to block. Note that this is * actually reliable test even without the lock - the caller * must assure that nobody can come after the DQUOT_DROP and * add quota pointers back anyway. */ dquots = i_dquot(inode); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (dquots[cnt]) break; } if (cnt < MAXQUOTAS) __dquot_drop(inode); } EXPORT_SYMBOL(dquot_drop); /* * inode_reserved_space is managed internally by quota, and protected by * i_lock similar to i_blocks+i_bytes. */ static qsize_t *inode_reserved_space(struct inode * inode) { /* Filesystem must explicitly define it's own method in order to use * quota reservation interface */ BUG_ON(!inode->i_sb->dq_op->get_reserved_space); return inode->i_sb->dq_op->get_reserved_space(inode); } static qsize_t __inode_get_rsv_space(struct inode *inode) { if (!inode->i_sb->dq_op->get_reserved_space) return 0; return *inode_reserved_space(inode); } static qsize_t inode_get_rsv_space(struct inode *inode) { qsize_t ret; if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } /* * This functions updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). * * NOTE: We absolutely rely on the fact that caller dirties the inode * (usually helpers in quotaops.h care about this) and holds a handle for * the current transaction so that dquot write and inode write go into the * same transaction. */ /* * This operation can block, but only after everything is updated */ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; int reserve = flags & DQUOT_SPACE_RESERVE; struct dquot __rcu **dquots; struct dquot *dquot; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; spin_unlock(&inode->i_lock); } else { inode_add_bytes(inode, number); } goto out; } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; if (reserve) { ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]); } else { ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]); } if (ret) { /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); goto out_flush_warn; } } if (reserve) *inode_reserved_space(inode) += number; else __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; ret = mark_all_dquot_dirty(dquots); out_flush_warn: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); out: return ret; } EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; if (!inode_quota_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; ret = dquot_add_inodes(dquot, 1, &warn[cnt]); if (ret) { for (cnt--; cnt >= 0; cnt--) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; /* Back out changes we already did */ spin_lock(&dquot->dq_dqb_lock); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } goto warn_put_all; } } warn_put_all: spin_unlock(&inode->i_lock); if (ret == 0) ret = mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); return ret; } EXPORT_SYMBOL(dquot_alloc_inode); /* * Convert in-memory reserved quotas to real consumed quotas */ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) number = dquot->dq_dqb.dqb_rsvspace; dquot->dq_dqb.dqb_curspace += number; dquot->dq_dqb.dqb_rsvspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) -= number; __inode_add_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_claim_space_nodirty); /* * Convert allocated space back to in-memory reserved quotas */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { struct dquot __rcu **dquots; struct dquot *dquot; int cnt, index; if (!inode_quota_active(inode)) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (dquot) { spin_lock(&dquot->dq_dqb_lock); if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) number = dquot->dq_dqb.dqb_curspace; dquot->dq_dqb.dqb_rsvspace += number; dquot->dq_dqb.dqb_curspace -= number; spin_unlock(&dquot->dq_dqb_lock); } } /* Update inode bytes */ *inode_reserved_space(inode) += number; __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); } EXPORT_SYMBOL(dquot_reclaim_space_nodirty); /* * This operation can block, but only after everything is updated */ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu **dquots; struct dquot *dquot; int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!inode_quota_active(inode)) { if (reserve) { spin_lock(&inode->i_lock); *inode_reserved_space(inode) -= number; spin_unlock(&inode->i_lock); } else { inode_sub_bytes(inode, number); } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_bdq_free(dquot, number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); if (reserve) dquot_free_reserved_space(dquot, number); else dquot_decr_space(dquot, number); spin_unlock(&dquot->dq_dqb_lock); } if (reserve) *inode_reserved_space(inode) -= number; else __inode_sub_bytes(inode, number); spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; mark_all_dquot_dirty(dquots); out_unlock: srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; struct dquot __rcu * const *dquots; struct dquot *dquot; int index; if (!inode_quota_active(inode)) return; dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; dquot = srcu_dereference(dquots[cnt], &dquot_srcu); if (!dquot) continue; spin_lock(&dquot->dq_dqb_lock); wtype = info_idq_free(dquot, 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquot, wtype); dquot_decr_inodes(dquot, 1); spin_unlock(&dquot->dq_dqb_lock); } spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); } EXPORT_SYMBOL(dquot_free_inode); /* * Transfer the number of inode and blocks from one diskquota to an other. * On success, dquot references in transfer_to are consumed and references * to original dquots that need to be released are placed there. On failure, * references are kept untouched. * * This operation can block, but only after everything is updated * A transaction must be started when entering this function. * * We are holding reference on transfer_from & transfer_to, no need to * protect them by srcu_read_lock(). */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot __rcu **dquots; struct dquot *transfer_from[MAXQUOTAS] = {}; int cnt, index, ret = 0, err; char is_valid[MAXQUOTAS] = {}; struct dquot_warn warn_to[MAXQUOTAS]; struct dquot_warn warn_from_inodes[MAXQUOTAS]; struct dquot_warn warn_from_space[MAXQUOTAS]; if (IS_NOQUOTA(inode)) return 0; if (inode->i_sb->dq_op->get_inode_usage) { ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage); if (ret) return ret; } /* Initialize the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { warn_to[cnt].w_type = QUOTA_NL_NOWARN; warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; } spin_lock(&dq_data_lock); spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } cur_space = __inode_get_bytes(inode); rsv_space = __inode_get_rsv_space(inode); dquots = i_dquot(inode); /* * Build the transfer_from list, check limits, and update usage in * the target structures. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. */ if (!transfer_to[cnt]) continue; /* Avoid races with quotaoff() */ if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; transfer_from[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu, lockdep_is_held(&dq_data_lock)); ret = dquot_add_inodes(transfer_to[cnt], inode_usage, &warn_to[cnt]); if (ret) goto over_quota; ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, DQUOT_SPACE_WARN, &warn_to[cnt]); if (ret) { spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); goto over_quota; } } /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); wtype = info_bdq_free(transfer_from[cnt], cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); dquot_decr_inodes(transfer_from[cnt], inode_usage); dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } rcu_assign_pointer(dquots[cnt], transfer_to[cnt]); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); /* * These arrays are local and we hold dquot references so we don't need * the srcu protection but still take dquot_srcu to avoid warning in * mark_all_dquot_dirty(). */ index = srcu_read_lock(&dquot_srcu); err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from); if (err < 0) ret = err; err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to); if (err < 0) ret = err; srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn_to); flush_warnings(warn_from_inodes); flush_warnings(warn_from_space); /* Pass back references to put */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (is_valid[cnt]) transfer_to[cnt] = transfer_from[cnt]; return ret; over_quota: /* Back out changes we already did */ for (cnt--; cnt >= 0; cnt--) { if (!is_valid[cnt]) continue; spin_lock(&transfer_to[cnt]->dq_dqb_lock); dquot_decr_inodes(transfer_to[cnt], inode_usage); dquot_decr_space(transfer_to[cnt], cur_space); dquot_free_reserved_space(transfer_to[cnt], rsv_space); spin_unlock(&transfer_to[cnt]->dq_dqb_lock); } spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; } EXPORT_SYMBOL(__dquot_transfer); /* Wrapper for transferring ownership of an inode for uid/gid only * Called from FSXXX_setattr() */ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { struct dquot *transfer_to[MAXQUOTAS] = {}; struct dquot *dquot; struct super_block *sb = inode->i_sb; int ret; if (!inode_quota_active(inode)) return 0; if (i_uid_needs_update(idmap, iattr, inode)) { kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode), iattr->ia_vfsuid); dquot = dqget(sb, make_kqid_uid(kuid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[USRQUOTA] = dquot; } if (i_gid_needs_update(idmap, iattr, inode)) { kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode), iattr->ia_vfsgid); dquot = dqget(sb, make_kqid_gid(kgid)); if (IS_ERR(dquot)) { if (PTR_ERR(dquot) != -ESRCH) { ret = PTR_ERR(dquot); goto out_put; } dquot = NULL; } transfer_to[GRPQUOTA] = dquot; } ret = __dquot_transfer(inode, transfer_to); out_put: dqput_all(transfer_to); return ret; } EXPORT_SYMBOL(dquot_transfer); /* * Write info of quota file to disk */ int dquot_commit_info(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); return dqopt->ops[type]->write_file_info(sb, type); } EXPORT_SYMBOL(dquot_commit_info); int dquot_get_next_id(struct super_block *sb, struct kqid *qid) { struct quota_info *dqopt = sb_dqopt(sb); if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; return dqopt->ops[qid->type]->get_next_id(sb, qid); } EXPORT_SYMBOL(dquot_get_next_id); /* * Definitions of diskquota operations. */ const struct dquot_operations dquot_operations = { .write_dquot = dquot_commit, .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, .get_next_id = dquot_get_next_id, }; EXPORT_SYMBOL(dquot_operations); /* * Generic helper for ->open on filesystems supporting disk quotas. */ int dquot_file_open(struct inode *inode, struct file *file) { int error; error = generic_file_open(inode, file); if (!error && (file->f_mode & FMODE_WRITE)) error = dquot_initialize(inode); return error; } EXPORT_SYMBOL(dquot_file_open); static void vfs_cleanup_quota_inode(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode = dqopt->files[type]; if (!inode) return; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { inode_lock(inode); inode->i_flags &= ~S_NOQUOTA; inode_unlock(inode); } dqopt->files[type] = NULL; iput(inode); } /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ int dquot_disable(struct super_block *sb, int type, unsigned int flags) { int cnt; struct quota_info *dqopt = sb_dqopt(sb); rwsem_assert_held_write(&sb->s_umount); /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | DQUOT_USAGE_ENABLED))) return -EINVAL; /* * Skip everything if there's nothing to do. We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ if (!sb_any_quota_loaded(sb)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_loaded(sb, cnt)) continue; if (flags & DQUOT_SUSPENDED) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); } else { spin_lock(&dq_state_lock); dqopt->flags &= ~dquot_state_flag(flags, cnt); /* Turning off suspended quotas? */ if (!sb_has_quota_loaded(sb, cnt) && sb_has_quota_suspended(sb, cnt)) { dqopt->flags &= ~dquot_state_flag( DQUOT_SUSPENDED, cnt); spin_unlock(&dq_state_lock); vfs_cleanup_quota_inode(sb, cnt); continue; } spin_unlock(&dq_state_lock); } /* We still have to keep quota loaded? */ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we * should be only users of the info. No locks needed. */ if (info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); if (dqopt->ops[cnt]->free_file_info) dqopt->ops[cnt]->free_file_info(sb, cnt); put_quota_format(dqopt->info[cnt].dqi_format); dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) goto put_inodes; /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); sync_blockdev(sb->s_bdev); /* Now the quota files are just ordinary files and we can set the * inode flags back. Moreover we discard the pagecache so that * userspace sees the writes we did bypassing the pagecache. We * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) { inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); put_inodes: /* We are done when suspending quotas */ if (flags & DQUOT_SUSPENDED) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) if (!sb_has_quota_loaded(sb, cnt)) vfs_cleanup_quota_inode(sb, cnt); return 0; } EXPORT_SYMBOL(dquot_disable); int dquot_quota_off(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); } EXPORT_SYMBOL(dquot_quota_off); /* * Turn quotas on on a device */ static int vfs_setup_quota_inode(struct inode *inode, int type) { struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); if (is_bad_inode(inode)) return -EUCLEAN; if (!S_ISREG(inode->i_mode)) return -EACCES; if (IS_RDONLY(inode)) return -EROFS; if (sb_has_quota_loaded(sb, type)) return -EBUSY; /* * Quota files should never be encrypted. They should be thought of as * filesystem metadata, not user data. New-style internal quota files * cannot be encrypted by users anyway, but old-style external quota * files could potentially be incorrectly created in an encrypted * directory, hence this explicit check. Some reasons why encrypted * quota files don't work include: (1) some filesystems that support * encryption don't handle it in their quota_read and quota_write, and * (2) cleaning up encrypted quota files at unmount would need special * consideration, as quota files are cleaned up later than user files. */ if (IS_ENCRYPTED(inode)) return -EINVAL; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) return -EIO; if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added */ __dquot_drop(inode); } return 0; } int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags) { struct quota_format_type *fmt; struct quota_info *dqopt = sb_dqopt(sb); int error; lockdep_assert_held_write(&sb->s_umount); /* Just unsuspend quotas? */ if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED)) return -EINVAL; fmt = find_quota_format(format_id); if (!fmt) return -ESRCH; if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } /* Filesystems outside of init_user_ns not yet supported */ if (sb->s_user_ns != &init_user_ns) { error = -EINVAL; goto out_fmt; } /* Usage always has to be set... */ if (!(flags & DQUOT_USAGE_ENABLED)) { error = -EINVAL; goto out_fmt; } if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_fmt; } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the * dirty data and invalidate caches so that kernel sees * changes from userspace. It is not enough to just flush * the quota file since if blocksize < pagesize, invalidation * of the cache could fail because of other unrelated dirty * data */ sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_fmt; dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); error = dqopt->ops[type]->read_file_info(sb, type); if (error < 0) goto out_fmt; if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; spin_unlock(&dq_data_lock); } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); error = add_dquot_ref(sb, type); if (error) dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; out_fmt: put_quota_format(fmt); return error; } EXPORT_SYMBOL(dquot_load_quota_sb); /* * More powerful function for turning on quotas on given quota inode allowing * setting of individual quota flags */ int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags) { int err; err = vfs_setup_quota_inode(inode, type); if (err < 0) return err; err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags); if (err < 0) vfs_cleanup_quota_inode(inode->i_sb, type); return err; } EXPORT_SYMBOL(dquot_load_quota_inode); /* Reenable quotas on remount RW */ int dquot_resume(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); int ret = 0, cnt; unsigned int flags; rwsem_assert_held_write(&sb->s_umount); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; if (!sb_has_quota_suspended(sb, cnt)) continue; spin_lock(&dq_state_lock); flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED, cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); flags = dquot_generic_flag(flags, cnt); ret = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id, flags); if (ret < 0) vfs_cleanup_quota_inode(sb, cnt); } return ret; } EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { int error = security_quota_on(path->dentry); if (error) return error; /* Quota file not on the same filesystem? */ if (path->dentry->d_sb != sb) error = -EXDEV; else error = dquot_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; } EXPORT_SYMBOL(dquot_quota_on); /* * This function is used when filesystem needs to initialize quotas * during mount time. */ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type) { struct dentry *dentry; int error; dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); dput(dentry); return error; } EXPORT_SYMBOL(dquot_quota_on_mount); static int dquot_quota_enable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* Accounting cannot be turned on while fs is mounted */ flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); if (!flags) return -EINVAL; for (type = 0; type < MAXQUOTAS; type++) { if (!(flags & qtype_enforce_flag(type))) continue; /* Can't enforce without accounting */ if (!sb_has_quota_usage_enabled(sb, type)) { ret = -EINVAL; goto out_err; } if (sb_has_quota_limits_enabled(sb, type)) { /* compatible with XFS */ ret = -EEXIST; goto out_err; } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } return 0; out_err: /* Backout enforcement enablement we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); } return ret; } static int dquot_quota_disable(struct super_block *sb, unsigned int flags) { int ret; int type; struct quota_info *dqopt = sb_dqopt(sb); if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) return -ENOSYS; /* * We don't support turning off accounting via quotactl. In principle * quota infrastructure can do this but filesystems don't expect * userspace to be able to do it. */ if (flags & (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) return -EOPNOTSUPP; /* Filter out limits not enabled */ for (type = 0; type < MAXQUOTAS; type++) if (!sb_has_quota_limits_enabled(sb, type)) flags &= ~qtype_enforce_flag(type); /* Nothing left? */ if (!flags) return -EEXIST; for (type = 0; type < MAXQUOTAS; type++) { if (flags & qtype_enforce_flag(type)) { ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); if (ret < 0) goto out_err; } } return 0; out_err: /* Backout enforcement disabling we already did */ for (type--; type >= 0; type--) { if (flags & qtype_enforce_flag(type)) { spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, type); spin_unlock(&dq_state_lock); } } return ret; } /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_dqblk); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid, struct qc_dqblk *di) { struct dquot *dquot; int err; if (!sb->dq_op->get_next_id) return -ENOSYS; err = sb->dq_op->get_next_id(sb, qid); if (err < 0) return err; dquot = dqget(sb, *qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); do_get_dqblk(dquot, di); dqput(dquot); return 0; } EXPORT_SYMBOL(dquot_get_next_dqblk); #define VFS_QC_MASK \ (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; int ret; if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_SOFT) dm->dqb_bsoftlimit = di->d_spc_softlimit; if (di->d_fieldmask & QC_SPC_HARD) dm->dqb_bhardlimit = di->d_spc_hardlimit; if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_COUNT) { dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_SPC_TIMER) { dm->dqb_btime = di->d_spc_timer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } if (di->d_fieldmask & QC_INO_TIMER) { dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } if (check_blim) { if (!dm->dqb_bsoftlimit || dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || dm->dqb_curinodes <= dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dquot->dq_dqb_lock); ret = mark_dquot_dirty(dquot); if (ret < 0) return ret; return 0; } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *di) { struct dquot *dquot; int rc; dquot = dqget(sb, qid); if (IS_ERR(dquot)) { rc = PTR_ERR(dquot); goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); out: return rc; } EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common part of quota file information */ int dquot_get_state(struct super_block *sb, struct qc_state *state) { struct mem_dqinfo *mi; struct qc_type_state *tstate; struct quota_info *dqopt = sb_dqopt(sb); int type; memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) continue; tstate = state->s_state + type; mi = sb_dqopt(sb)->info + type; tstate->flags = QCI_ACCT_ENABLED; spin_lock(&dq_data_lock); if (mi->dqi_flags & DQF_SYS_FILE) tstate->flags |= QCI_SYSFILE; if (mi->dqi_flags & DQF_ROOT_SQUASH) tstate->flags |= QCI_ROOT_SQUASH; if (sb_has_quota_limits_enabled(sb, type)) tstate->flags |= QCI_LIMITS_ENFORCED; tstate->spc_timelimit = mi->dqi_bgrace; tstate->ino_timelimit = mi->dqi_igrace; if (dqopt->files[type]) { tstate->ino = dqopt->files[type]->i_ino; tstate->blocks = dqopt->files[type]->i_blocks; } tstate->nextents = 1; /* We don't know... */ spin_unlock(&dq_data_lock); } return 0; } EXPORT_SYMBOL(dquot_get_state); /* Generic routine for setting common part of quota file information */ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; if (!sb_has_quota_active(sb, type)) return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) mi->dqi_bgrace = ii->i_spc_timelimit; if (ii->i_fieldmask & QC_INO_TIMER) mi->dqi_igrace = ii->i_ino_timelimit; if (ii->i_fieldmask & QC_FLAGS) { if (ii->i_flags & QCI_ROOT_SQUASH) mi->dqi_flags |= DQF_ROOT_SQUASH; else mi->dqi_flags &= ~DQF_ROOT_SQUASH; } spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ return sb->dq_op->write_info(sb, type); } EXPORT_SYMBOL(dquot_set_dqinfo); const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .get_nextdqblk = dquot_get_next_dqblk, .set_dqblk = dquot_set_dqblk }; EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); /* Filter negative values for non-monotonic counters */ if (value < 0 && (type == DQST_ALLOC_DQUOTS || type == DQST_FREE_DQUOTS)) value = 0; /* Update global table */ dqstats.stat[type] = value; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } static const struct ctl_table fs_dqstats_table[] = { { .procname = "lookups", .data = &dqstats.stat[DQST_LOOKUPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "drops", .data = &dqstats.stat[DQST_DROPS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "reads", .data = &dqstats.stat[DQST_READS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "writes", .data = &dqstats.stat[DQST_WRITES], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "cache_hits", .data = &dqstats.stat[DQST_CACHE_HITS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "allocated_dquots", .data = &dqstats.stat[DQST_ALLOC_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "free_dquots", .data = &dqstats.stat[DQST_FREE_DQUOTS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, { .procname = "syncs", .data = &dqstats.stat[DQST_SYNCS], .maxlen = sizeof(unsigned long), .mode = 0444, .proc_handler = do_proc_dqstats, }, #ifdef CONFIG_PRINT_QUOTA_WARNING { .procname = "warnings", .data = &flag_print_warnings, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); register_sysctl_init("fs/quota", fs_dqstats_table); dquot_cachep = kmem_cache_create("dquot", sizeof(struct dquot), sizeof(unsigned long) * 4, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_PANIC), NULL); order = 0; dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order); if (!dquot_hash) panic("Cannot create dquot hash table"); ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL, _DQST_DQSTAT_LAST); if (ret) panic("Cannot create dquot stat counters"); /* Find power-of-two hlist_heads which can fit into allocation */ nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); dq_hash_bits = ilog2(nr_hash); nr_hash = 1UL << dq_hash_bits; dq_hash_mask = nr_hash - 1; for (i = 0; i < nr_hash; i++) INIT_HLIST_HEAD(dquot_hash + i); pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); if (!dqcache_shrinker) panic("Cannot allocate dquot shrinker"); dqcache_shrinker->count_objects = dqcache_shrink_count; dqcache_shrinker->scan_objects = dqcache_shrink_scan; shrinker_register(dqcache_shrinker); return 0; } fs_initcall(dquot_init);
362 203 218 217 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */ #include <linux/cred.h> #include <linux/fs.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> #include <linux/seq_file.h> #include "internal.h" /* * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t, * never from raw values. These are just internal helpers. */ #define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val } #define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val } struct mnt_idmap { struct uid_gid_map uid_map; struct uid_gid_map gid_map; refcount_t count; }; /* * Carries the initial idmapping of 0:0:4294967295 which is an identity * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. */ struct mnt_idmap nop_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(nop_mnt_idmap); /* * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range. * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID. */ struct mnt_idmap invalid_mnt_idmap = { .count = REFCOUNT_INIT(1), }; EXPORT_SYMBOL_GPL(invalid_mnt_idmap); /** * initial_idmapping - check whether this is the initial mapping * @ns: idmapping to check * * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1, * [...], 1000 to 1000 [...]. * * Return: true if this is the initial mapping, false if not. */ static inline bool initial_idmapping(const struct user_namespace *ns) { return ns == &init_user_ns; } /** * make_vfsuid - map a filesystem kuid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kuid : kuid to be mapped * * Take a @kuid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kuid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kuid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kuid won't change when calling * from_kuid() so we can simply retrieve the value via __kuid_val() * directly. * * Return: @kuid mapped according to @idmap. * If @kuid has no mapping in either @idmap or @fs_userns INVALID_UID is * returned. */ vfsuid_t make_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kuid_t kuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return VFSUIDT_INIT(kuid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSUID; if (initial_idmapping(fs_userns)) uid = __kuid_val(kuid); else uid = from_kuid(fs_userns, kuid); if (uid == (uid_t)-1) return INVALID_VFSUID; return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid)); } EXPORT_SYMBOL_GPL(make_vfsuid); /** * make_vfsgid - map a filesystem kgid according to an idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @kgid : kgid to be mapped * * Take a @kgid and remap it from @fs_userns into @idmap. Use this * function when preparing a @kgid to be reported to userspace. * * If initial_idmapping() determines that this is not an idmapped mount * we can simply return @kgid unchanged. * If initial_idmapping() tells us that the filesystem is not mounted with an * idmapping we know the value of @kgid won't change when calling * from_kgid() so we can simply retrieve the value via __kgid_val() * directly. * * Return: @kgid mapped according to @idmap. * If @kgid has no mapping in either @idmap or @fs_userns INVALID_GID is * returned. */ vfsgid_t make_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, kgid_t kgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return VFSGIDT_INIT(kgid); if (idmap == &invalid_mnt_idmap) return INVALID_VFSGID; if (initial_idmapping(fs_userns)) gid = __kgid_val(kgid); else gid = from_kgid(fs_userns, kgid); if (gid == (gid_t)-1) return INVALID_VFSGID; return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid)); } EXPORT_SYMBOL_GPL(make_vfsgid); /** * from_vfsuid - map a vfsuid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsuid : vfsuid to be mapped * * Map @vfsuid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsuid to inode->i_uid. * * Return: @vfsuid mapped into the filesystem idmapping */ kuid_t from_vfsuid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsuid_t vfsuid) { uid_t uid; if (idmap == &nop_mnt_idmap) return AS_KUIDT(vfsuid); if (idmap == &invalid_mnt_idmap) return INVALID_UID; uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid)); if (uid == (uid_t)-1) return INVALID_UID; if (initial_idmapping(fs_userns)) return KUIDT_INIT(uid); return make_kuid(fs_userns, uid); } EXPORT_SYMBOL_GPL(from_vfsuid); /** * from_vfsgid - map a vfsgid into the filesystem idmapping * @idmap: the mount's idmapping * @fs_userns: the filesystem's idmapping * @vfsgid : vfsgid to be mapped * * Map @vfsgid into the filesystem idmapping. This function has to be used in * order to e.g. write @vfsgid to inode->i_gid. * * Return: @vfsgid mapped into the filesystem idmapping */ kgid_t from_vfsgid(struct mnt_idmap *idmap, struct user_namespace *fs_userns, vfsgid_t vfsgid) { gid_t gid; if (idmap == &nop_mnt_idmap) return AS_KGIDT(vfsgid); if (idmap == &invalid_mnt_idmap) return INVALID_GID; gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid)); if (gid == (gid_t)-1) return INVALID_GID; if (initial_idmapping(fs_userns)) return KGIDT_INIT(gid); return make_kgid(fs_userns, gid); } EXPORT_SYMBOL_GPL(from_vfsgid); #ifdef CONFIG_MULTIUSER /** * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups * @vfsgid: the mnt gid to match * * This function can be used to determine whether @vfsuid matches any of the * caller's groups. * * Return: 1 if vfsuid matches caller's groups, 0 if not. */ int vfsgid_in_group_p(vfsgid_t vfsgid) { return in_group_p(AS_KGIDT(vfsgid)); } #else int vfsgid_in_group_p(vfsgid_t vfsgid) { return 1; } #endif EXPORT_SYMBOL_GPL(vfsgid_in_group_p); static int copy_mnt_idmap(struct uid_gid_map *map_from, struct uid_gid_map *map_to) { struct uid_gid_extent *forward, *reverse; u32 nr_extents = READ_ONCE(map_from->nr_extents); /* Pairs with smp_wmb() when writing the idmapping. */ smp_rmb(); /* * Don't blindly copy @map_to into @map_from if nr_extents is * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we * read @nr_extents someone could have written an idmapping and * then we might end up with inconsistent data. So just don't do * anything at all. */ if (nr_extents == 0) return -EINVAL; /* * Here we know that nr_extents is greater than zero which means * a map has been written. Since idmappings can't be changed * once they have been written we know that we can safely copy * from @map_to into @map_from. */ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { *map_to = *map_from; return 0; } forward = kmemdup_array(map_from->forward, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!forward) return -ENOMEM; reverse = kmemdup_array(map_from->reverse, nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL_ACCOUNT); if (!reverse) { kfree(forward); return -ENOMEM; } /* * The idmapping isn't exposed anywhere so we don't need to care * about ordering between extent pointers and @nr_extents * initialization. */ map_to->forward = forward; map_to->reverse = reverse; map_to->nr_extents = nr_extents; return 0; } static void free_mnt_idmap(struct mnt_idmap *idmap) { if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->uid_map.forward); kfree(idmap->uid_map.reverse); } if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(idmap->gid_map.forward); kfree(idmap->gid_map.reverse); } kfree(idmap); } struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) { struct mnt_idmap *idmap; int ret; idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); if (!idmap) return ERR_PTR(-ENOMEM); refcount_set(&idmap->count, 1); ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map); if (!ret) ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map); if (ret) { free_mnt_idmap(idmap); idmap = ERR_PTR(ret); } return idmap; } /** * mnt_idmap_get - get a reference to an idmapping * @idmap: the idmap to bump the reference on * * If @idmap is not the @nop_mnt_idmap bump the reference count. * * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. */ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap) refcount_inc(&idmap->count); return idmap; } EXPORT_SYMBOL_GPL(mnt_idmap_get); /** * mnt_idmap_put - put a reference to an idmapping * @idmap: the idmap to put the reference on * * If this is a non-initial idmapping, put the reference count when a mount is * released and free it if we're the last user. */ void mnt_idmap_put(struct mnt_idmap *idmap) { if (idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap && refcount_dec_and_test(&idmap->count)) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) { struct uid_gid_map *map, *map_up; u32 idx, nr_mappings; if (!is_valid_mnt_idmap(idmap)) return 0; /* * Idmappings are shown relative to the caller's idmapping. * This is both the most intuitive and most useful solution. */ if (uid_map) { map = &idmap->uid_map; map_up = &current_user_ns()->uid_map; } else { map = &idmap->gid_map; map_up = &current_user_ns()->gid_map; } for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { uid_t lower; struct uid_gid_extent *extent; if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = &map->extent[idx]; else extent = &map->forward[idx]; /* * Verify that the whole range of the mapping can be * resolved in the caller's idmapping. If it cannot be * resolved skip the mapping. */ lower = map_id_range_up(map_up, extent->lower_first, extent->count); if (lower == (uid_t) -1) continue; seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); seq->count++; /* mappings are separated by \0 */ if (seq_has_overflowed(seq)) return -EAGAIN; nr_mappings++; } return nr_mappings; }
18 21 21 21 14 14 12 7 12 7 14 14 14 14 14 1 13 14 13 13 14 14 5 14 14 21 21 21 21 13 13 21 21 21 21 22 22 15 22 22 22 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-only #undef DEBUG /* * ARM performance counter support. * * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com> * * This code is based on the sparc64 perf event code, which is in turn based * on the x86 code. */ #define pr_fmt(fmt) "hw perfevents: " fmt #include <linux/bitmap.h> #include <linux/cpumask.h> #include <linux/cpu_pm.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/perf/arm_pmu.h> #include <linux/slab.h> #include <linux/sched/clock.h> #include <linux/spinlock.h> #include <linux/irq.h> #include <linux/irqdesc.h> #include <asm/irq_regs.h> static int armpmu_count_irq_users(const int irq); struct pmu_irq_ops { void (*enable_pmuirq)(unsigned int irq); void (*disable_pmuirq)(unsigned int irq); void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid); }; static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { free_irq(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmuirq_ops = { .enable_pmuirq = enable_irq, .disable_pmuirq = disable_irq_nosync, .free_pmuirq = armpmu_free_pmuirq }; static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { free_nmi(irq, per_cpu_ptr(devid, cpu)); } static const struct pmu_irq_ops pmunmi_ops = { .enable_pmuirq = enable_nmi, .disable_pmuirq = disable_nmi_nosync, .free_pmuirq = armpmu_free_pmunmi }; static void armpmu_enable_percpu_pmuirq(unsigned int irq) { enable_percpu_irq(irq, IRQ_TYPE_NONE); } static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_irq(irq, devid); } static const struct pmu_irq_ops percpu_pmuirq_ops = { .enable_pmuirq = armpmu_enable_percpu_pmuirq, .disable_pmuirq = disable_percpu_irq, .free_pmuirq = armpmu_free_percpu_pmuirq }; static void armpmu_enable_percpu_pmunmi(unsigned int irq) { if (!prepare_percpu_nmi(irq)) enable_percpu_nmi(irq, IRQ_TYPE_NONE); } static void armpmu_disable_percpu_pmunmi(unsigned int irq) { disable_percpu_nmi(irq); teardown_percpu_nmi(irq); } static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu, void __percpu *devid) { if (armpmu_count_irq_users(irq) == 1) free_percpu_nmi(irq, devid); } static const struct pmu_irq_ops percpu_pmunmi_ops = { .enable_pmuirq = armpmu_enable_percpu_pmunmi, .disable_pmuirq = armpmu_disable_percpu_pmunmi, .free_pmuirq = armpmu_free_percpu_pmunmi }; static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu); static DEFINE_PER_CPU(int, cpu_irq); static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops); static bool has_nmi; static inline u64 arm_pmu_event_max_period(struct perf_event *event) { if (event->hw.flags & ARMPMU_EVT_64BIT) return GENMASK_ULL(63, 0); else if (event->hw.flags & ARMPMU_EVT_63BIT) return GENMASK_ULL(62, 0); else if (event->hw.flags & ARMPMU_EVT_47BIT) return GENMASK_ULL(46, 0); else return GENMASK_ULL(31, 0); } static int armpmu_map_cache_event(const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u64 config) { unsigned int cache_type, cache_op, cache_result, ret; cache_type = (config >> 0) & 0xff; if (cache_type >= PERF_COUNT_HW_CACHE_MAX) return -EINVAL; cache_op = (config >> 8) & 0xff; if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) return -EINVAL; cache_result = (config >> 16) & 0xff; if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) return -EINVAL; if (!cache_map) return -ENOENT; ret = (int)(*cache_map)[cache_type][cache_op][cache_result]; if (ret == CACHE_OP_UNSUPPORTED) return -ENOENT; return ret; } static int armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config) { int mapping; if (config >= PERF_COUNT_HW_MAX) return -EINVAL; if (!event_map) return -ENOENT; mapping = (*event_map)[config]; return mapping == HW_OP_UNSUPPORTED ? -ENOENT : mapping; } static int armpmu_map_raw_event(u32 raw_event_mask, u64 config) { return (int)(config & raw_event_mask); } int armpmu_map_event(struct perf_event *event, const unsigned (*event_map)[PERF_COUNT_HW_MAX], const unsigned (*cache_map) [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask) { u64 config = event->attr.config; int type = event->attr.type; if (type == event->pmu->type) return armpmu_map_raw_event(raw_event_mask, config); switch (type) { case PERF_TYPE_HARDWARE: return armpmu_map_hw_event(event_map, config); case PERF_TYPE_HW_CACHE: return armpmu_map_cache_event(cache_map, config); case PERF_TYPE_RAW: return armpmu_map_raw_event(raw_event_mask, config); } return -ENOENT; } int armpmu_event_set_period(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; u64 max_period; int ret = 0; max_period = arm_pmu_event_max_period(event); if (unlikely(left <= -period)) { left = period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } /* * Limit the maximum period to prevent the counter value * from overtaking the one we are about to program. In * effect we are reducing max_period to account for * interrupt latency (and we are being very conservative). */ if (left > (max_period >> 1)) left = (max_period >> 1); local64_set(&hwc->prev_count, (u64)-left); armpmu->write_counter(event, (u64)(-left) & max_period); perf_event_update_userpage(event); return ret; } u64 armpmu_event_update(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; u64 delta, prev_raw_count, new_raw_count; u64 max_period = arm_pmu_event_max_period(event); again: prev_raw_count = local64_read(&hwc->prev_count); new_raw_count = armpmu->read_counter(event); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; delta = (new_raw_count - prev_raw_count) & max_period; local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); return new_raw_count; } static void armpmu_read(struct perf_event *event) { armpmu_event_update(event); } static void armpmu_stop(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to update the counter, so ignore * PERF_EF_UPDATE, see comments in armpmu_start(). */ if (!(hwc->state & PERF_HES_STOPPED)) { armpmu->disable(event); armpmu_event_update(event); hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; } } static void armpmu_start(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; /* * ARM pmu always has to reprogram the period, so ignore * PERF_EF_RELOAD, see the comment below. */ if (flags & PERF_EF_RELOAD) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; /* * Set the period again. Some counters can't be stopped, so when we * were stopped we simply disabled the IRQ source and the counter * may have been left counting. If we don't do this step then we may * get an interrupt too soon or *way* too late if the overflow has * happened since disabling. */ armpmu_event_set_period(event); armpmu->enable(event); } static void armpmu_del(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; armpmu_stop(event, PERF_EF_UPDATE); hw_events->events[idx] = NULL; armpmu->clear_event_idx(hw_events, event); perf_event_update_userpage(event); /* Clear the allocated counter */ hwc->idx = -1; } static int armpmu_add(struct perf_event *event, int flags) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct hw_perf_event *hwc = &event->hw; int idx; /* An event following a process won't be stopped earlier */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return -ENOENT; /* If we don't have a space for the counter then finish early. */ idx = armpmu->get_event_idx(hw_events, event); if (idx < 0) return idx; /* The newly-allocated counter should be empty */ WARN_ON_ONCE(hw_events->events[idx]); event->hw.idx = idx; hw_events->events[idx] = event; hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; if (flags & PERF_EF_START) armpmu_start(event, PERF_EF_RELOAD); /* Propagate our changes to the userspace mapping. */ perf_event_update_userpage(event); return 0; } static int validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events, struct perf_event *event) { struct arm_pmu *armpmu; if (is_software_event(event)) return 1; /* * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The * core perf code won't check that the pmu->ctx == leader->ctx * until after pmu->event_init(event). */ if (event->pmu != pmu) return 0; if (event->state < PERF_EVENT_STATE_OFF) return 1; if (event->state == PERF_EVENT_STATE_OFF && !event->attr.enable_on_exec) return 1; armpmu = to_arm_pmu(event->pmu); return armpmu->get_event_idx(hw_events, event) >= 0; } static int validate_group(struct perf_event *event) { struct perf_event *sibling, *leader = event->group_leader; struct pmu_hw_events fake_pmu; /* * Initialise the fake PMU. We only need to populate the * used_mask for the purposes of validation. */ memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask)); if (!validate_event(event->pmu, &fake_pmu, leader)) return -EINVAL; if (event == leader) return 0; for_each_sibling_event(sibling, leader) { if (!validate_event(event->pmu, &fake_pmu, sibling)) return -EINVAL; } if (!validate_event(event->pmu, &fake_pmu, event)) return -EINVAL; return 0; } static irqreturn_t armpmu_dispatch_irq(int irq, void *dev) { struct arm_pmu *armpmu; int ret; u64 start_clock, finish_clock; /* * we request the IRQ with a (possibly percpu) struct arm_pmu**, but * the handlers expect a struct arm_pmu*. The percpu_irq framework will * do any necessary shifting, we just need to perform the first * dereference. */ armpmu = *(void **)dev; if (WARN_ON_ONCE(!armpmu)) return IRQ_NONE; start_clock = sched_clock(); ret = armpmu->handle_irq(armpmu); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); return ret; } static int __hw_perf_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; int mapping, ret; hwc->flags = 0; mapping = armpmu->map_event(event); if (mapping < 0) { pr_debug("event %x:%llx not supported\n", event->attr.type, event->attr.config); return mapping; } /* * We don't assign an index until we actually place the event onto * hardware. Use -1 to signify that we haven't decided where to put it * yet. For SMP systems, each core has it's own PMU so we can't do any * clever allocation or constraints checking at this point. */ hwc->idx = -1; hwc->config_base = 0; hwc->config = 0; hwc->event_base = 0; /* * Check whether we need to exclude the counter from certain modes. */ if (armpmu->set_event_filter) { ret = armpmu->set_event_filter(hwc, &event->attr); if (ret) return ret; } /* * Store the event encoding into the config_base field. */ hwc->config_base |= (unsigned long)mapping; if (!is_sampling_event(event)) { /* * For non-sampling runs, limit the sample_period to half * of the counter width. That way, the new counter value * is far less likely to overtake the previous one unless * you have some serious IRQ latency issues. */ hwc->sample_period = arm_pmu_event_max_period(event) >> 1; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); } return validate_group(event); } static int armpmu_event_init(struct perf_event *event) { struct arm_pmu *armpmu = to_arm_pmu(event->pmu); /* * Reject CPU-affine events for CPUs that are of a different class to * that which this PMU handles. Process-following events (where * event->cpu == -1) can be migrated between CPUs, and thus we have to * reject them later (in armpmu_add) if they're scheduled on a * different class of CPU. */ if (event->cpu != -1 && !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus)) return -ENOENT; /* does not support taken branch sampling */ if (has_branch_stack(event)) return -EOPNOTSUPP; return __hw_perf_event_init(event); } static void armpmu_enable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; if (enabled) armpmu->start(armpmu); } static void armpmu_disable(struct pmu *pmu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); /* For task-bound events we may be called on other CPUs */ if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return; armpmu->stop(armpmu); } /* * In heterogeneous systems, events are specific to a particular * microarchitecture, and aren't suitable for another. Thus, only match CPUs of * the same microarchitecture. */ static bool armpmu_filter(struct pmu *pmu, int cpu) { struct arm_pmu *armpmu = to_arm_pmu(pmu); return !cpumask_test_cpu(cpu, &armpmu->supported_cpus); } static ssize_t cpus_show(struct device *dev, struct device_attribute *attr, char *buf) { struct arm_pmu *armpmu = to_arm_pmu(dev_get_drvdata(dev)); return cpumap_print_to_pagebuf(true, buf, &armpmu->supported_cpus); } static DEVICE_ATTR_RO(cpus); static struct attribute *armpmu_common_attrs[] = { &dev_attr_cpus.attr, NULL, }; static const struct attribute_group armpmu_common_attr_group = { .attrs = armpmu_common_attrs, }; static int armpmu_count_irq_users(const int irq) { int cpu, count = 0; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) == irq) count++; } return count; } static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq) { const struct pmu_irq_ops *ops = NULL; int cpu; for_each_possible_cpu(cpu) { if (per_cpu(cpu_irq, cpu) != irq) continue; ops = per_cpu(cpu_irq_ops, cpu); if (ops) break; } return ops; } void armpmu_free_irq(int irq, int cpu) { if (per_cpu(cpu_irq, cpu) == 0) return; if (WARN_ON(irq != per_cpu(cpu_irq, cpu))) return; per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu); per_cpu(cpu_irq, cpu) = 0; per_cpu(cpu_irq_ops, cpu) = NULL; } int armpmu_request_irq(int irq, int cpu) { int err = 0; const irq_handler_t handler = armpmu_dispatch_irq; const struct pmu_irq_ops *irq_ops; if (!irq) return 0; if (!irq_is_percpu_devid(irq)) { unsigned long irq_flags; err = irq_force_affinity(irq, cpumask_of(cpu)); if (err && num_possible_cpus() > 1) { pr_warn("unable to set irq affinity (irq=%d, cpu=%u)\n", irq, cpu); goto err_out; } irq_flags = IRQF_PERCPU | IRQF_NOBALANCING | IRQF_NO_AUTOEN | IRQF_NO_THREAD; err = request_nmi(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_irq(irq, handler, irq_flags, "arm-pmu", per_cpu_ptr(&cpu_armpmu, cpu)); irq_ops = &pmuirq_ops; } else { has_nmi = true; irq_ops = &pmunmi_ops; } } else if (armpmu_count_irq_users(irq) == 0) { err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu); /* If cannot get an NMI, get a normal interrupt */ if (err) { err = request_percpu_irq(irq, handler, "arm-pmu", &cpu_armpmu); irq_ops = &percpu_pmuirq_ops; } else { has_nmi = true; irq_ops = &percpu_pmunmi_ops; } } else { /* Per cpudevid irq was already requested by another CPU */ irq_ops = armpmu_find_irq_ops(irq); if (WARN_ON(!irq_ops)) err = -EINVAL; } if (err) goto err_out; per_cpu(cpu_irq, cpu) = irq; per_cpu(cpu_irq_ops, cpu) = irq_ops; return 0; err_out: pr_err("unable to request IRQ%d for ARM PMU counters\n", irq); return err; } static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) { struct pmu_hw_events __percpu *hw_events = pmu->hw_events; return per_cpu(hw_events->irq, cpu); } bool arm_pmu_irq_is_nmi(void) { return has_nmi; } /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading * junk values out of them. */ static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; if (pmu->reset) pmu->reset(pmu); per_cpu(cpu_armpmu, cpu) = pmu; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq); return 0; } static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node) { struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node); int irq; if (!cpumask_test_cpu(cpu, &pmu->supported_cpus)) return 0; irq = armpmu_get_cpu_irq(pmu, cpu); if (irq) per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq); per_cpu(cpu_armpmu, cpu) = NULL; return 0; } #ifdef CONFIG_CPU_PM static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd) { struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); struct perf_event *event; int idx; for_each_set_bit(idx, armpmu->cntr_mask, ARMPMU_MAX_HWEVENTS) { event = hw_events->events[idx]; if (!event) continue; switch (cmd) { case CPU_PM_ENTER: /* * Stop and update the counter */ armpmu_stop(event, PERF_EF_UPDATE); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: /* * Restore and enable the counter. */ armpmu_start(event, PERF_EF_RELOAD); break; default: break; } } } static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd, void *v) { struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb); struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events); bool enabled = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS); if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus)) return NOTIFY_DONE; /* * Always reset the PMU registers on power-up even if * there are no events running. */ if (cmd == CPU_PM_EXIT && armpmu->reset) armpmu->reset(armpmu); if (!enabled) return NOTIFY_OK; switch (cmd) { case CPU_PM_ENTER: armpmu->stop(armpmu); cpu_pm_pmu_setup(armpmu, cmd); break; case CPU_PM_EXIT: case CPU_PM_ENTER_FAILED: cpu_pm_pmu_setup(armpmu, cmd); armpmu->start(armpmu); break; default: return NOTIFY_DONE; } return NOTIFY_OK; } static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify; return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb); } static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { cpu_pm_unregister_notifier(&cpu_pmu->cpu_pm_nb); } #else static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; } static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { } #endif static int cpu_pmu_init(struct arm_pmu *cpu_pmu) { int err; err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); if (err) goto out; err = cpu_pm_pmu_register(cpu_pmu); if (err) goto out_unregister; return 0; out_unregister: cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); out: return err; } static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu) { cpu_pm_pmu_unregister(cpu_pmu); cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING, &cpu_pmu->node); } struct arm_pmu *armpmu_alloc(void) { struct arm_pmu *pmu; int cpu; pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); if (!pmu) goto out; pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL); if (!pmu->hw_events) { pr_info("failed to allocate per-cpu PMU data.\n"); goto out_free_pmu; } pmu->pmu = (struct pmu) { .pmu_enable = armpmu_enable, .pmu_disable = armpmu_disable, .event_init = armpmu_event_init, .add = armpmu_add, .del = armpmu_del, .start = armpmu_start, .stop = armpmu_stop, .read = armpmu_read, .filter = armpmu_filter, .attr_groups = pmu->attr_groups, /* * This is a CPU PMU potentially in a heterogeneous * configuration (e.g. big.LITTLE) so * PERF_PMU_CAP_EXTENDED_HW_TYPE is required to open * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a * specific PMU. */ .capabilities = PERF_PMU_CAP_EXTENDED_REGS | PERF_PMU_CAP_EXTENDED_HW_TYPE, }; pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] = &armpmu_common_attr_group; for_each_possible_cpu(cpu) { struct pmu_hw_events *events; events = per_cpu_ptr(pmu->hw_events, cpu); events->percpu_pmu = pmu; } return pmu; out_free_pmu: kfree(pmu); out: return NULL; } void armpmu_free(struct arm_pmu *pmu) { free_percpu(pmu->hw_events); kfree(pmu); } int armpmu_register(struct arm_pmu *pmu) { int ret; ret = cpu_pmu_init(pmu); if (ret) return ret; if (!pmu->set_event_filter) pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE; ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (ret) goto out_destroy; pr_info("enabled with %s PMU driver, %d (%*pb) counters available%s\n", pmu->name, bitmap_weight(pmu->cntr_mask, ARMPMU_MAX_HWEVENTS), ARMPMU_MAX_HWEVENTS, &pmu->cntr_mask, has_nmi ? ", using NMIs" : ""); kvm_host_pmu_init(pmu); return 0; out_destroy: cpu_pmu_destroy(pmu); return ret; } static int arm_pmu_hp_init(void) { int ret; ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING, "perf/arm/pmu:starting", arm_perf_starting_cpu, arm_perf_teardown_cpu); if (ret) pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n", ret); return ret; } subsys_initcall(arm_pmu_hp_init);
1548 1548 4 1543 30 1540 1548 54 529 18 65 63 65 64 66 64 1539 599 1278 588 12 591 539 53 1541 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/realpath.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/magic.h> #include <linux/proc_fs.h> /** * tomoyo_encode2 - Encode binary string to ascii string. * * @str: String in binary format. * @str_len: Size of @str in byte. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode2(const char *str, int str_len) { int i; int len = 0; const char *p = str; char *cp; char *cp0; if (!p) return NULL; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') len += 2; else if (c > ' ' && c < 127) len++; else len += 4; } len++; /* Reserve space for appending "/". */ cp = kzalloc(len + 10, GFP_NOFS); if (!cp) return NULL; cp0 = cp; p = str; for (i = 0; i < str_len; i++) { const unsigned char c = p[i]; if (c == '\\') { *cp++ = '\\'; *cp++ = '\\'; } else if (c > ' ' && c < 127) { *cp++ = c; } else { *cp++ = '\\'; *cp++ = (c >> 6) + '0'; *cp++ = ((c >> 3) & 7) + '0'; *cp++ = (c & 7) + '0'; } } return cp0; } /** * tomoyo_encode - Encode binary string to ascii string. * * @str: String in binary format. * * Returns pointer to @str in ascii format on success, NULL otherwise. * * This function uses kzalloc(), so caller must kfree() if this function * didn't return NULL. */ char *tomoyo_encode(const char *str) { return str ? tomoyo_encode2(str, strlen(str)) : NULL; } /** * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root. * * @path: Pointer to "struct path". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { /* go to whatever namespace root we are under */ pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(path->dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_dentry_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. * * If dentry is a directory, trailing '/' is appended. */ static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer, const int buflen) { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { pos = dentry_path_raw(dentry, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = d_backing_inode(dentry); if (inode && S_ISDIR(inode->i_mode)) { buffer[buflen - 2] = '/'; buffer[buflen - 1] = '\0'; } } } return pos; } /** * tomoyo_get_local_path - Get the path of a dentry. * * @dentry: Pointer to "struct dentry". * @buffer: Pointer to buffer to return value in. * @buflen: Sizeof @buffer. * * Returns the buffer on success, an error code otherwise. */ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, const int buflen) { struct super_block *sb = dentry->d_sb; char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen); if (IS_ERR(pos)) return pos; /* Convert from $PID to self if $PID is current thread. */ if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') { char *ep; const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10); struct pid_namespace *proc_pidns = proc_pid_ns(sb); if (*ep == '/' && pid && pid == task_tgid_nr_ns(current, proc_pidns)) { pos = ep - 5; if (pos < buffer) goto out; memmove(pos, "/self", 5); } goto prepend_filesystem_name; } /* Use filesystem name for unnamed devices. */ if (!MAJOR(sb->s_dev)) goto prepend_filesystem_name; { struct inode *inode = d_backing_inode(sb->s_root); /* * Use filesystem name if filesystem does not support rename() * operation. */ if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ { char name[64]; int name_len; const dev_t dev = sb->s_dev; name[sizeof(name) - 1] = '\0'; snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev), MINOR(dev)); name_len = strlen(name); pos -= name_len; if (pos < buffer) goto out; memmove(pos, name, name_len); return pos; } /* Prepend filesystem name. */ prepend_filesystem_name: { const char *name = sb->s_type->name; const int name_len = strlen(name); pos -= name_len + 1; if (pos < buffer) goto out; memmove(pos, name, name_len); pos[name_len] = ':'; } return pos; out: return ERR_PTR(-ENOMEM); } /** * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root. * * @path: Pointer to "struct path". * * Returns the realpath of the given @path on success, NULL otherwise. * * If dentry is a directory, trailing '/' is appended. * Characters out of 0x20 < c < 0x7F range are converted to * \ooo style octal string. * Character \ is converted to \\ string. * * These functions use kzalloc(), so the caller must call kfree() * if these functions didn't return NULL. */ char *tomoyo_realpath_from_path(const struct path *path) { char *buf = NULL; char *name = NULL; unsigned int buf_len = PAGE_SIZE / 2; struct dentry *dentry = path->dentry; struct super_block *sb = dentry->d_sb; while (1) { char *pos; struct inode *inode; buf_len <<= 1; kfree(buf); buf = kmalloc(buf_len, GFP_NOFS); if (!buf) break; /* To make sure that pos is '\0' terminated. */ buf[buf_len - 1] = '\0'; /* For "pipe:[\$]" and "socket:[\$]". */ if (dentry->d_op && dentry->d_op->d_dname) { pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1); goto encode; } inode = d_backing_inode(sb->s_root); /* * Get local name for filesystems without rename() operation */ if ((!inode->i_op->rename && !(sb->s_type->fs_flags & FS_REQUIRES_DEV))) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. */ else { pos = tomoyo_get_absolute_path(path, buf, buf_len - 1); /* * Fall back to local name if absolute name is not * available. */ if (pos == ERR_PTR(-EINVAL)) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); } encode: if (IS_ERR(pos)) continue; name = tomoyo_encode(pos); break; } kfree(buf); if (!name) tomoyo_warn_oom(__func__); return name; } /** * tomoyo_realpath_nofollow - Get realpath of a pathname. * * @pathname: The pathname to solve. * * Returns the realpath of @pathname on success, NULL otherwise. */ char *tomoyo_realpath_nofollow(const char *pathname) { struct path path; if (pathname && kern_path(pathname, 0, &path) == 0) { char *buf = tomoyo_realpath_from_path(&path); path_put(&path); return buf; } return NULL; }
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif
100 101 100 759 101 101 191 191 189 191 190 190 190 29 29 33 14 29 9 22 190 191 190 29 33 29 35 29 29 223 2 222 223 204 204 2 2 2 2 377 375 313 314 101 629 335 376 729 729 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include <linux/mnt_idmapping.h> #include <linux/pidfs.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __ro_after_init; static unsigned int m_hash_shift __ro_after_init; static unsigned int mp_hash_mask __ro_after_init; static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { if (!str) return 0; mhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { if (!str) return 0; mphash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mphash_entries=", set_mphash_entries); static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ enum mount_kattr_flags_t { MOUNT_KATTR_RECURSE = (1 << 0), MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), }; struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { if (!node) return NULL; return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); } static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) { struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_b = node_to_mnt_ns(b); u64 seq_a = ns_a->seq; u64 seq_b = ns_b->seq; if (seq_a < seq_b) return -1; if (seq_a > seq_b) return 1; return 0; } static inline void mnt_ns_tree_write_lock(void) { write_seqlock(&mnt_ns_tree_lock); } static inline void mnt_ns_tree_write_unlock(void) { write_sequnlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) { struct rb_node *node, *prev; mnt_ns_tree_write_lock(); node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); /* * If there's no previous entry simply add it after the * head and if there is add it after the previous entry. */ prev = rb_prev(&ns->mnt_ns_tree_node); if (!prev) list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); else list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); mnt_ns_tree_write_unlock(); WARN_ON_ONCE(node); } static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) static void mnt_ns_release_rcu(struct rcu_head *rcu) { mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); } static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (!is_anon_ns(ns)) { mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); list_bidir_del_rcu(&ns->mnt_ns_list); mnt_ns_tree_write_unlock(); } call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); } static int mnt_ns_find(const void *key, const struct rb_node *node) { const u64 mnt_ns_id = *(u64 *)key; const struct mnt_namespace *ns = node_to_mnt_ns(node); if (mnt_ns_id < ns->seq) return -1; if (mnt_ns_id > ns->seq) return 1; return 0; } /* * Lookup a mount namespace by id and take a passive reference count. Taking a * passive reference means the mount namespace can be emptied if e.g., the last * task holding an active reference exits. To access the mounts of the * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. * * Note the lookup is lockless protected by a sequence counter. We only * need to guard against false negatives as false positives aren't * possible. So if we didn't find a mount namespace and the sequence * counter has changed we need to retry. If the sequence counter is * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { struct mnt_namespace *ns; struct rb_node *node; unsigned int seq; guard(rcu)(); do { seq = read_seqbegin(&mnt_ns_tree_lock); node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); if (node) break; } while (read_seqretry(&mnt_ns_tree_lock, seq)); if (!node) return NULL; /* * The last reference count is put with RCU delay so we can * unconditonally acquire a reference here. */ ns = node_to_mnt_ns(node); refcount_inc(&ns->passive); return ns; } static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res; xa_lock(&mnt_id_xa); res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); return res; } static void mnt_free_id(struct mount *mnt) { xa_erase(&mnt_id_xa, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); else mnt->mnt_devname = "none"; if (!mnt->mnt_devname) goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. * It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; /* * The barrier pairs with the barrier in sb_start_ro_state_change() * making sure if we don't see s_readonly_remount set yet, we also will * not see any superblock / mount flag changes done by remount. * It also pairs with the barrier in sb_end_ro_state_change() * assuring that if we see s_readonly_remount already cleared, we will * see the values of superblock / mount flags updated by remount. */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task * setting MNT_WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting * MNT_WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); lock_mount_hash(); unlock_mount_hash(); preempt_disable(); } } /* * The barrier pairs with the barrier sb_start_ro_state_change() making * sure that if we see MNT_WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. */ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be * paired with mnt_put_write_access_file. */ int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* * Superblock may have become readonly while there are still * writable fd's, e.g. due to a fs error with errors=remount-ro */ if (__mnt_is_readonly(file->f_path.mnt)) return -EROFS; return 0; } return mnt_get_write_access(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but if the file is already open for writing it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the freeze protection and the check for emergency r/o * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * mnt_get_write_access() call above. */ void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); /** * mnt_hold_writers - prevent write access to the given mount * @mnt: mnt to prevent write access to * * Prevents write access to @mnt if there are no active writers for @mnt. * This function needs to be called and return successfully before changing * properties of @mnt that need to remain stable for callers with write access * to @mnt. * * After this functions has been called successfully callers must pair it with * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing * setting MNT_WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * MNT_WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * MNT_WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) return -EBUSY; return 0; } /** * mnt_unhold_writers - stop preventing write access to the given mount * @mnt: mnt to stop preventing write access to * * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * * This function can only be called after a successful call to * mnt_hold_writers(). * * Context: This function expects lock_mount_hash() to be held. */ static inline void mnt_unhold_writers(struct mount *mnt) { /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } static int mnt_make_readonly(struct mount *mnt) { int ret; ret = mnt_hold_writers(mnt); if (!ret) mnt->mnt.mnt_flags |= MNT_READONLY; mnt_unhold_writers(mnt); return ret; } int sb_prepare_remount_readonly(struct super_block *sb) { struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { err = mnt_hold_writers(mnt); if (err) break; } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) sb_start_ro_state_change(sb); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); return err; } static void free_vfsmnt(struct mount *mnt) { mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; lock_mount_hash(); if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /** * __lookup_mnt - find first child mount * @mnt: parent mount * @dentry: mountpoint * * If @mnt has a child mount @c mounted @dentry find and return it. * * Note that the child mount @c need not be unique. There are cases * where shadow mounts are created. For example, during mount * propagation when a source mount @mnt whose root got overmounted by a * mount @o after path lookup but before @namespace_sem could be * acquired gets copied and propagated. So @mnt gets copied including * @o. When @mnt is propagated to a destination mount @d that already * has another mount @n mounted at the same mountpoint then the source * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt * on @dentry. * * Return: The first child of @mnt mounted @dentry or NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the * following mounts: * * mount /dev/sda1 /mnt * mount /dev/sda2 /mnt * mount /dev/sda3 /mnt * * Then lookup_mnt() on the base /mnt dentry in the root mount will * return successively the root dentry and vfsmount of /dev/sda1, then * /dev/sda2, then /dev/sda3, then NULL. * * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; bool is_covered = false; down_read(&namespace_sem); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } up_read(&namespace_sem); return is_covered; } static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { mp->m_count++; return mp; } } return NULL; } static struct mountpoint *get_mountpoint(struct dentry *dentry) { struct mountpoint *mp, *new = NULL; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return ERR_PTR(-ENOENT); mountpoint: read_seqlock_excl(&mount_lock); mp = lookup_mountpoint(dentry); read_sequnlock_excl(&mount_lock); if (mp) goto done; } if (!new) new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ mp = ERR_PTR(ret); if (ret) goto done; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); new->m_dentry = dget(dentry); new->m_count = 1; hlist_add_head(&new->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&new->m_list); read_sequnlock_excl(&mount_lock); mp = new; new = NULL; done: kfree(new); return mp; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* called with namespace_lock and vfsmount lock */ static void put_mountpoint(struct mountpoint *mp) { __put_mountpoint(mp, &ex_mountpoints); } static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } static inline bool check_anonymous_mnt(struct mount *mnt) { u64 seq; if (!is_anon_ns(mnt->mnt_ns)) return false; seq = mnt->mnt_ns->seq_origin; return !seq || (seq == current->nsproxy->mnt_ns->seq); } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static struct mountpoint *unhash_mnt(struct mount *mnt) { struct mountpoint *mp; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; return mp; } /* * vfsmount lock must be held for write */ static void umount_mnt(struct mount *mnt) { put_mountpoint(unhash_mnt(mnt)); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } /** * mnt_set_mountpoint_beneath - mount a mount beneath another one * * @new_parent: the source mount * @top_mnt: the mount beneath which @new_parent is mounted * @new_mp: the new mountpoint of @top_mnt on @new_parent * * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and * parent @top_mnt->mnt_parent and mount it on top of @new_parent at * @new_mp. And mount @new_parent on the old parent and old * mountpoint of @top_mnt. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void mnt_set_mountpoint_beneath(struct mount *new_parent, struct mount *top_mnt, struct mountpoint *new_mp) { struct mount *old_top_parent = top_mnt->mnt_parent; struct mountpoint *old_top_mp = top_mnt->mnt_mp; mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent); mnt_change_mountpoint(new_parent, new_mp, top_mnt); } static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /** * attach_mnt - mount a mount, attach to @mount_hashtable and parent's * list of child mounts * @parent: the parent * @mnt: the new mount * @mp: the new mountpoint * @beneath: whether to mount @mnt beneath or on top of @parent * * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt * to @parent's child mount list and to @mount_hashtable. * * If @beneath is true, remove @mnt from its current parent and * mountpoint and mount it on @mp on @parent, and mount @parent on the * old parent and old mountpoint of @mnt. Finally, attach @parent to * @mnt_hashtable and @parent->mnt_parent->mnt_mounts. * * Note, when __attach_mnt() is called @mnt->mnt_parent already points * to the correct parent. * * Context: This function expects namespace_lock() and lock_mount_hash() * to have been acquired in that order. */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp, bool beneath) { if (beneath) mnt_set_mountpoint_beneath(mnt, parent, mp); else mnt_set_mountpoint(parent, mp, mnt); /* * Note, @mnt->mnt_parent has to be used. If @mnt was mounted * beneath @parent then @mnt will need to be attached to * @parent's old parent, not @parent. IOW, @mnt->mnt_parent * isn't the same mount as @parent. */ __attach_mnt(mnt, mnt->mnt_parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp, false); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); } static inline struct mount *node_to_mount(struct rb_node *node) { return node ? rb_entry(node, struct mount, mnt_node) : NULL; } static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; mnt_last_node = false; } else { link = &parent->rb_right; mnt_first_node = false; } } if (mnt_last_node) ns->mnt_last_node = &mnt->mnt_node; if (mnt_first_node) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); mnt_notify_add(mnt); } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); while (!list_empty(&head)) { m = list_first_entry(&head, typeof(*m), mnt_list); list_del(&m->mnt_list); mnt_add_to_ns(n, m); } n->nr_mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); touch_mnt_namespace(n); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. */ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; atomic_inc(&fc->root->d_sb->s_active); mnt->mnt.mnt_sb = fc->root->d_sb; mnt->mnt.mnt_root = dget(fc->root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); if ((flag & CL_SLAVE) || ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); } else if (!(flag & CL_PRIVATE)) { if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } else { CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); /* stick the duplicate mount on the same expiry list * as the original if that was on one */ if (flag & CL_EXPIRE) { if (!list_empty(&old->mnt_expire)) list_add(&mnt->mnt_expire, &old->mnt_expire); } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); list_del(&mnt->mnt_instance); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __put_mountpoint(unhash_mnt(p), &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /* * Make a mount point inaccessible to new lookups. * Because there may still be current users, the caller MUST WAIT * for an RCU grace period before destroying the mount point. */ void mnt_make_shortterm(struct vfsmount *mnt) { if (mnt) real_mount(mnt)->mnt_ns = NULL; } /** * path_is_mountpoint() - Check if path is a mount in the current namespace. * @path: path to check * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } /* * Returns the mount which either has the specified mnt_id, or has the next * smallest id afer the specified one. */ static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id <= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_left; } else { node = node->rb_right; } } return ret; } /* * Returns the mount which either has the specified mnt_id, or has the next * greater id before the specified one. */ static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) { struct rb_node *node = ns->mounts.rb_node; struct mount *ret = NULL; while (node) { struct mount *m = node_to_mount(node); if (mnt_id >= m->mnt_id_unique) { ret = node_to_mount(node); if (mnt_id == m->mnt_id_unique) break; node = node->rb_right; } else { node = node->rb_left; } } return ret; } #ifdef CONFIG_PROC_FS /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; down_read(&namespace_sem); return mnt_find_id_at(p->ns, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct mount *next = NULL, *mnt = v; struct rb_node *node = rb_next(&mnt->mnt_node); ++*pos; if (node) { next = node_to_mount(node); *pos = next->mnt_id_unique; } return next; } static void m_stop(struct seq_file *m, void *v) { up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @m: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; struct mount *p; BUG_ON(!m); /* write lock needed for mnt_get_count */ lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += mnt_get_count(p); minimum_refs += 2; } unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; return 1; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); #ifdef CONFIG_FSNOTIFY static void mnt_notify(struct mount *p) { if (!p->prev_ns && p->mnt_ns) { fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } else if (p->prev_ns && !p->mnt_ns) { fsnotify_mnt_detach(p->prev_ns, &p->mnt); } else if (p->prev_ns == p->mnt_ns) { fsnotify_mnt_move(p->mnt_ns, &p->mnt); } else { fsnotify_mnt_detach(p->prev_ns, &p->mnt); fsnotify_mnt_attach(p->mnt_ns, &p->mnt); } p->prev_ns = p->mnt_ns; } static void notify_mnt_list(void) { struct mount *m, *tmp; /* * Notify about mounts that were added/reparented/detached/remain * connected after unmount. */ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) { mnt_notify(m); list_del_init(&m->to_notify); } } static bool need_notify_mnt_list(void) { return !list_empty(&notify_list); } #else static void notify_mnt_list(void) { } static bool need_notify_mnt_list(void) { return false; } #endif static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); if (need_notify_mnt_list()) { /* * No point blocking out concurrent readers while notifications * are sent. This will also allow statmount()/listmount() to run * concurrently. */ downgrade_write(&namespace_sem); notify_mnt_list(); up_read(&namespace_sem); } else { up_write(&namespace_sem); } shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? */ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propagated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->nr_mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); /* * At this point p->mnt_ns is NULL, notification will be queued * only if * * - p->prev_ns is non-NULL *and* * - p->prev_ns->n_fsnotify_marks is non-NULL * * This will preclude queuing the mount if this is a cleanup * after a failed copy_tree() or destruction of an anonymous * namespace, etc. */ mnt_notify_add(p); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Recheck MNT_LOCKED with the locks held */ retval = -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; event++; if (flags & MNT_DETACH) { if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_mutex. */ void __detach_mounts(struct dentry *dentry) { struct mountpoint *mp; struct mount *mnt; namespace_lock(); lock_mount_hash(); mp = lookup_mountpoint(dentry); if (!mp) goto out_unlock; event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } put_mountpoint(mp); out_unlock: unlock_mount_hash(); namespace_unlock(); } /* * Is the caller allowed to modify his namespace? */ bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } static void warn_mandlock(void) { pr_warn_once("=======================================================\n" "WARNING: The mand mount option has been deprecated and\n" " and is ignored by this kernel. Remove the mand\n" " option from the mount to silence this warning.\n" "=======================================================\n"); } static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; if (!path_mounted(path)) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { struct ns_common *ns; /* Is this a proxy for a mount namespace? */ if (dentry->d_op != &ns_dentry_operations) return false; ns = d_inode(dentry)->i_private; return ns->ops == &mntns_operations; } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { guard(rcu)(); for (;;) { struct list_head *list; if (previous) list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); else list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); if (list_is_head(list, &mnt_ns_list)) return ERR_PTR(-ENOENT); mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); /* * The last passive reference count is put with RCU * delay so accessing the mount namespace is not just * safe but all relevant members are still valid. */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* * We need an active reference count as we're persisting * the mount namespace and it might already be on its * deathbed. */ if (!refcount_inc_not_zero(&mntns->ns.count)) continue; return mntns; } } struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) { if (!is_mnt_ns_file(dentry)) return NULL; return to_mnt_ns(get_proc_ns(dentry->d_inode)); } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); if (!mnt_ns) return false; return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, int flag) { struct mount *res, *src_parent, *src_root_child, *src_mnt, *dst_parent, *dst_mnt; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = dst_mnt = clone_mnt(src_root, dentry, flag); if (IS_ERR(dst_mnt)) return dst_mnt; src_parent = src_root; dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint; list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) continue; for (src_mnt = src_root_child; src_mnt; src_mnt = next_mnt(src_mnt, src_root_child)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_mnt)) { if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ dst_mnt = ERR_PTR(-EPERM); goto out; } else { src_mnt = skip_mnt_tree(src_mnt); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(src_mnt->mnt.mnt_root)) { src_mnt = skip_mnt_tree(src_mnt); continue; } while (src_parent != src_mnt->mnt_parent) { src_parent = src_parent->mnt_parent; dst_mnt = dst_mnt->mnt_parent; } src_parent = src_mnt; dst_parent = dst_mnt; dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); if (IS_ERR(dst_mnt)) goto out; lock_mount_hash(); list_add_tail(&dst_mnt->mnt_list, &res->mnt_list); attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return dst_mnt; } /* Caller should check returned pointer for errors */ struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); if (!check_mnt(real_mount(path->mnt))) tree = ERR_PTR(-EINVAL); else tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); return &tree->mnt; } static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); static inline bool must_dissolve(struct mnt_namespace *mnt_ns) { /* * This mount belonged to an anonymous mount namespace * but was moved to a non-anonymous mount namespace and * then unmounted. */ if (unlikely(!mnt_ns)) return false; /* * This mount belongs to a non-anonymous mount namespace * and we know that such a mount can never transition to * an anonymous mount namespace again. */ if (!is_anon_ns(mnt_ns)) { /* * A detached mount either belongs to an anonymous mount * namespace or a non-anonymous mount namespace. It * should never belong to something purely internal. */ VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); return false; } return true; } void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; struct mount *m = real_mount(mnt); scoped_guard(rcu) { if (!must_dissolve(READ_ONCE(m->mnt_ns))) return; } scoped_guard(namespace_lock, &namespace_sem) { ns = m->mnt_ns; if (!must_dissolve(ns)) return; /* * After must_dissolve() we know that this is a detached * mount in an anonymous mount namespace. * * Now when mnt_has_parent() reports that this mount * tree has a parent, we know that this anonymous mount * tree has been moved to another anonymous mount * namespace. * * So when closing this file we cannot unmount the mount * tree. This will be done when the file referring to * the root of the anonymous mount namespace will be * closed (It could already be closed but it would sync * on @namespace_sem and wait for us to finish.). */ if (mnt_has_parent(m)) return; lock_mount_hash(); umount_tree(m, UMOUNT_CONNECTED); unlock_mount_hash(); } /* Make sure we notice when we leak mounts. */ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); umount_tree(real_mount(mnt), 0); unlock_mount_hash(); namespace_unlock(); } static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } bool has_locked_children(struct mount *mnt, struct dentry *dentry) { bool res; read_seqlock_excl(&mount_lock); res = __has_locked_children(mnt, dentry); read_sequnlock_excl(&mount_lock); return res; } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. */ static bool check_for_nsfs_mounts(struct mount *subtree) { struct mount *p; bool ret = false; lock_mount_hash(); for (p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) goto out; ret = true; out: unlock_mount_hash(); return ret; } /** * clone_private_mount - create a private clone of a path * @path: path to clone * * This creates a new vfsmount, which will be the clone of @path. The new mount * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). * * This assumes caller has called or done the equivalent of may_mount(). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; guard(rwsem_read)(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); /* * Make sure the source mount is acceptable. * Anything mounted in our mount namespace is allowed. * Otherwise, it must be the root of an anonymous mount * namespace, and we need to make sure no namespace * loops get created. */ if (!check_mnt(old_mnt)) { if (!is_mounted(&old_mnt->mnt) || !is_anon_ns(old_mnt->mnt_ns) || mnt_has_parent(old_mnt)) return ERR_PTR(-EINVAL); if (!check_for_nsfs_mounts(old_mnt)) return ERR_PTR(-EINVAL); } if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); if (__has_locked_children(old_mnt, path->dentry)) return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); if (IS_ERR(new_mnt)) return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { struct mount *mnt; int res = f(root, arg); if (res) return res; list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { res = f(&mnt->mnt, arg); if (res) return res; } return 0; } static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire)) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0; struct mount *p; if (ns->nr_mounts >= max) return -ENOSPC; max -= ns->nr_mounts; if (ns->pending_mounts >= max) return -ENOSPC; max -= ns->pending_mounts; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; if (mounts > max) return -ENOSPC; ns->pending_mounts += mounts; return 0; } enum mnt_tree_flags_t { MNT_TREE_MOVE = BIT(0), MNT_TREE_BENEATH = BIT(1), MNT_TREE_PROPAGATION = BIT(2), }; /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath * @dest_mp: the mountpoint @source_mnt will be mounted at * @flags: modify how @source_mnt is supposed to be attached * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. * (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. * * Context: The function expects namespace_lock() to be held. * Return: If @source_mnt was successfully attached 0 is returned. * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, struct mount *top_mnt, struct mountpoint *dest_mp, enum mnt_tree_flags_t flags) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = top_mnt->mnt_ns; struct mountpoint *smp; struct mount *child, *dest_mnt, *p; struct hlist_node *n; int err = 0; bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH; /* * Preallocate a mountpoint in case the new mounts need to be * mounted beneath mounts on the same mountpoint. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) return PTR_ERR(smp); /* Is there space to add these mounts to the mount namespace? */ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (beneath) dest_mnt = top_mnt->mnt_parent; else dest_mnt = top_mnt; if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); } lock_mount_hash(); if (err) goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } if (moving) { if (beneath) dest_mp = smp; unhash_mnt(source_mnt); attach_mnt(source_mnt, top_mnt, dest_mp, beneath); mnt_notify_add(source_mnt); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { LIST_HEAD(head); /* move from anon - the caller will destroy */ for (p = source_mnt; p; p = next_mnt(p, source_mnt)) move_from_ns(p, &head); list_del_init(&head); } if (beneath) mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp); else mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; commit_tree(child); } put_mountpoint(smp); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); put_mountpoint(smp); read_sequnlock_excl(&mount_lock); return err; } /** * do_lock_mount - lock mount and mountpoint * @path: target path * @beneath: whether the intention is to mount beneath @path * * Follow the mount stack on @path until the top mount @mnt is found. If * the initial @path->{mnt,dentry} is a mountpoint lookup the first * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} * until nothing is stacked on top of it anymore. * * Acquire the inode_lock() on the top mount's ->mnt_root to protect * against concurrent removal of the new mountpoint from another mount * namespace. * * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint * @mp on @mnt->mnt_parent must be acquired. This protects against a * concurrent unlink of @mp->mnt_dentry from another mount namespace * where @mnt doesn't have a child mount mounted @mp. A concurrent * removal of @mnt->mnt_root doesn't matter as nothing will be mounted * on top of it for @beneath. * * In addition, @beneath needs to make sure that @mnt hasn't been * unmounted or moved from its current mountpoint in between dropping * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt * being unmounted would be detected later by e.g., calling * check_mnt(mnt) in the function it's called from. For the @beneath * case however, it's useful to detect it directly in do_lock_mount(). * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. * * Return: Either the target mountpoint on the top mount or the top * mount's mountpoint. */ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) { struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct mountpoint *mp = ERR_PTR(-ENOENT); struct path under = {}; for (;;) { struct mount *m = real_mount(mnt); if (beneath) { path_put(&under); read_seqlock_excl(&mount_lock); under.mnt = mntget(&m->mnt_parent->mnt); under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dentry = under.dentry; } else { dentry = path->dentry; } inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) break; // not to be mounted on if (beneath && unlikely(m->mnt_mountpoint != dentry || &m->mnt_parent->mnt != under.mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); continue; // got moved } mnt = lookup_mnt(path); if (unlikely(mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); continue; // got overmounted } mp = get_mountpoint(dentry); if (IS_ERR(mp)) break; if (beneath) { /* * @under duplicates the references that will stay * at least until namespace_unlock(), so the path_put() * below is safe (and OK to do under namespace_lock - * we are not dropping the final references here). */ path_put(&under); } return mp; } namespace_unlock(); inode_unlock(dentry->d_inode); if (beneath) path_put(&under); return mp; } static inline struct mountpoint *lock_mount(struct path *path) { return do_lock_mount(path, false); } static void unlock_mount(struct mountpoint *where) { inode_unlock(where->m_dentry->d_inode); read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); namespace_unlock(); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, p, mp, 0); } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err = 0; if (!path_mounted(path)) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; namespace_lock(); if (!check_mnt(mnt)) { err = -EINVAL; goto out_unlock; } if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) goto out_unlock; } lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); unlock_mount_hash(); out_unlock: namespace_unlock(); return err; } /* may_copy_tree() - check if a mount tree can be copied * @path: path to the mount tree to be copied * * This helper checks if the caller may copy the mount tree starting * from @path->mnt. The caller may copy the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller tries to copy an nfs mount referring to a mount * namespace, i.e., the caller is trying to copy a mount namespace * entry from nsfs. * (3) The caller tries to copy a pidfs mount referring to a pidfd. * (4) The caller is trying to copy a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * This is not strictly necessary. The current semantics of the new * mount api enforce that the caller must be located in the same * mount namespace as the mount tree it interacts with. Using the * origin sequence number preserves these semantics even for * anonymous mount namespaces. However, one could envision extending * the api to directly operate across mount namespace if needed. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to copy the mount tree. * * Returns true if the mount tree can be copied, false otherwise. */ static inline bool may_copy_tree(struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; if (check_mnt(mnt)) return true; d_op = path->dentry->d_op; if (d_op == &ns_dentry_operations) return true; if (d_op == &pidfs_dentry_operations) return true; if (!is_mounted(path->mnt)) return false; return check_anonymous_mnt(mnt); } static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) return mnt; if (!may_copy_tree(old_path)) return mnt; if (!recurse && __has_locked_children(old, old_path->dentry)) return mnt; if (recurse) mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path->dentry, 0); if (!IS_ERR(mnt)) mnt->mnt.mnt_flags &= ~MNT_LOCKED; return mnt; } /* * do loopback mount. */ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto out; } parent = real_mount(path->mnt); if (!check_mnt(parent)) goto out2; mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: unlock_mount(mp); out: path_put(&old_path); return err; } static struct file *open_detached_copy(struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; struct file *file; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); /* * Record the sequence number of the source mount namespace. * This needs to hold namespace_sem to ensure that the mount * doesn't get attached. */ if (is_mounted(path->mnt)) { src_mnt_ns = real_mount(path->mnt)->mnt_ns; if (is_anon_ns(src_mnt_ns)) ns->seq_origin = src_mnt_ns->seq_origin; else ns->seq_origin = src_mnt_ns->seq; } mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); free_mnt_ns(ns); return ERR_CAST(mnt); } lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); mntput(path->mnt); path->mnt = &mnt->mnt; file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) return ERR_PTR(-EPERM); ret = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(ret)) return ERR_PTR(ret); if (detached) return open_detached_copy(&path, flags & AT_RECURSIVE); return dentry_open(&path, O_PATH, current_cred()); } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { int fd; struct file *file __free(fput) = NULL; file = vfs_open_tree(dfd, filename, flags); if (IS_ERR(file)) return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, no_free_ptr(file)); return fd; } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); mnt->mnt.mnt_flags &= ~MNT_READONLY; return 0; } static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf, *mntpath; buf = (char *)__get_free_page(GFP_KERNEL); if (buf) mntpath = d_path(mountpoint, buf, PAGE_SIZE); else mntpath = ERR_PTR(-ENOMEM); if (IS_ERR(mntpath)) mntpath = "(unknown)"; pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, &sb->s_time_max, (unsigned long long)sb->s_time_max); sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; if (buf) free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; /* * We're only checking whether the superblock is read-only not * changing it, so only take down_read(&sb->s_umount). */ down_read(&sb->s_umount); lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (!path_mounted(path)) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the remount request is coming * from the legacy mount system call. */ fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) { lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); unlock_mount_hash(); } } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; int err; from = real_mount(from_path->mnt); to = real_mount(to_path->mnt); namespace_lock(); err = -EINVAL; /* To and From must be mounted */ if (!is_mounted(&from->mnt)) goto out; if (!is_mounted(&to->mnt)) goto out; err = -EPERM; /* We should be allowed to modify mount namespaces of both mounts */ if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto out; if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) goto out; err = -EINVAL; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) goto out; if (!path_mounted(to_path)) goto out; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) goto out; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) goto out; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) goto out; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) goto out; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) goto out; if (IS_MNT_SLAVE(from)) { struct mount *m = from->mnt_master; list_add(&to->mnt_slave, &from->mnt_slave); to->mnt_master = m; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); lock_mount_hash(); set_mnt_shared(to); unlock_mount_hash(); } err = 0; out: namespace_unlock(); return err; } /** * path_overmounted - check if path is overmounted * @path: path to check * * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * * Context: namespace_sem must be held at least shared. * MUST NOT be called under lock_mount_hash() (there one should just * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { unsigned seq = read_seqbegin(&mount_lock); bool no_child; rcu_read_lock(); no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); if (need_seqretry(&mount_lock, seq)) { read_seqlock_excl(&mount_lock); no_child = !__lookup_mnt(path->mnt, path->dentry); read_sequnlock_excl(&mount_lock); } return unlikely(!no_child); } /** * can_move_mount_beneath - check that we can mount beneath the top mount * @from: mount to mount beneath * @to: mount under which to mount * @mp: mountpoint of @to * * - Make sure that @to->dentry is actually the root of a mount under * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. * - Ensure that nothing has been mounted on top of @from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to * nonsensical mount trees. * * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. */ static int can_move_mount_beneath(const struct path *from, const struct path *to, const struct mountpoint *mp) { struct mount *mnt_from = real_mount(from->mnt), *mnt_to = real_mount(to->mnt), *parent_mnt_to = mnt_to->mnt_parent; if (!mnt_has_parent(mnt_to)) return -EINVAL; if (!path_mounted(to)) return -EINVAL; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ if (path_overmounted(from)) return -EINVAL; /* * Mounting beneath the rootfs only makes sense when the * semantics of pivot_root(".", ".") are used. */ if (&mnt_to->mnt == current->fs->root.mnt) return -EINVAL; if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) if (p == mnt_to) return -EINVAL; /* * If the parent mount propagates to the child mount this would * mean mounting @mnt_from on @mnt_to->mnt_parent and then * propagating a copy @c of @mnt_from on top of @mnt_to. This * defeats the whole purpose of mounting beneath another mount. */ if (propagation_would_overmount(parent_mnt_to, mnt_to, mp)) return -EINVAL; /* * If @mnt_to->mnt_parent propagates to @mnt_from this would * mean propagating a copy @c of @mnt_from on top of @mnt_from. * Afterwards @mnt_from would be mounted on top of * @mnt_to->mnt_parent and @mnt_to would be unmounted from * @mnt->mnt_parent and remounted on @mnt_from. But since @c is * already mounted on @mnt_from, @mnt_to would ultimately be * remounted on top of @c. Afterwards, @mnt_from would be * covered by a copy @c of @mnt_from and @c would be covered by * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. */ if (check_mnt(mnt_from) && propagation_would_overmount(parent_mnt_to, mnt_from, mp)) return -EINVAL; return 0; } /* may_use_mount() - check if a mount tree can be used * @mnt: vfsmount to be used * * This helper checks if the caller may use the mount tree starting * from @path->mnt. The caller may use the mount tree under the * following circumstances: * * (1) The caller is located in the mount namespace of the mount tree. * This also implies that the mount does not belong to an anonymous * mount namespace. * (2) The caller is trying to use a mount tree that belongs to an * anonymous mount namespace. * * For that to be safe, this helper enforces that the origin mount * namespace the anonymous mount namespace was created from is the * same as the caller's mount namespace by comparing the sequence * numbers. * * The ownership of a non-anonymous mount namespace such as the * caller's cannot change. * => We know that the caller's mount namespace is stable. * * If the origin sequence number of the anonymous mount namespace is * the same as the sequence number of the caller's mount namespace. * => The owning namespaces are the same. * * ==> The earlier capability check on the owning namespace of the * caller's mount namespace ensures that the caller has the * ability to use the mount tree. * * Returns true if the mount tree can be used, false otherwise. */ static inline bool may_use_mount(struct mount *mnt) { if (check_mnt(mnt)) return true; /* * Make sure that noone unmounted the target path or somehow * managed to get their hands on something purely kernel * internal. */ if (!is_mounted(&mnt->mnt)) return false; return check_anonymous_mnt(mnt); } static int do_move_mount(struct path *old_path, struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; struct mount *old; struct mount *parent; struct mountpoint *mp, *old_mp; int err; bool attached, beneath = flags & MNT_TREE_BENEATH; mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) return PTR_ERR(mp); old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); if (attached) flags |= MNT_TREE_MOVE; old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; /* The thing moved must be mounted... */ if (!is_mounted(&old->mnt)) goto out; if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) goto out; /* ... and the target should be in our namespace */ if (!check_mnt(p)) goto out; } else { /* * otherwise the source must be the root of some anon namespace. * AV: check for mount being root of an anon namespace is worth * an inlined predicate... */ if (!is_anon_ns(ns) || mnt_has_parent(old)) goto out; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ if (ns == p->mnt_ns) goto out; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ if (!may_use_mount(p)) goto out; } if (!path_mounted(old_path)) goto out; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; /* * Don't move a mount residing in a shared parent. */ if (attached && IS_MNT_SHARED(parent)) goto out; if (beneath) { err = can_move_mount_beneath(old_path, new_path, mp); if (err) goto out; err = -EINVAL; p = p->mnt_parent; flags |= MNT_TREE_BENEATH; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out; err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) goto out; err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); if (err) goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); if (attached) put_mountpoint(old_mp); out: unlock_mount(mp); if (!err) { if (attached) { mntput_no_expire(parent); } else { /* Make sure we notice when we leak mounts. */ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); } } return err; } static int do_move_mount_old(struct path *path, const char *old_name) { struct path old_path; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, const struct path *path, int mnt_flags) { struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; error = security_sb_kern_mount(sb); if (!error && mount_too_revealing(sb, &mnt_flags)) error = -EPERM; if (unlikely(error)) { fc_drop_locked(fc); return error; } up_write(&sb->s_umount); mnt = vfs_create_mount(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); mnt_warn_timestamp_expiry(mountpoint, mnt); mp = lock_mount(mountpoint); if (IS_ERR(mp)) { mntput(mnt); return PTR_ERR(mp); } error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); unlock_mount(mp); if (error < 0) mntput(mnt); return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); /* * Indicate to the filesystem that the mount request is coming * from the legacy mount system call. */ fc->oldapi = true; if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype, strlen(subtype)); if (!err && name) err = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = vfs_get_tree(fc); if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; struct mountpoint *mp; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { err = -ELOOP; goto discard; } /* * we don't want to use lock_mount() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) { err = -ENOENT; goto discard_locked; } if (path_overmounted(path)) { err = 0; goto discard_locked; } mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto discard_locked; } err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); unlock_mount(mp); if (unlikely(err)) goto discard; return 0; discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); list_del_init(&mnt->mnt_expire); namespace_unlock(); } mntput(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { namespace_lock(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; namespace_lock(); lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!is_mounted(&mnt->mnt)) continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if (flags & SB_MANDLOCK) warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, flags, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; ret = path_mount(dev_name, &path, type_page, flags, data_page); path_put(&path); return ret; } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } /* * Assign a sequence number so we can detect when we attempt to bind * mount a reference to an older mount namespace into the current * mount namespace, preventing reference counting loops. A 64bit * number incrementing at 10Ghz will take 12,427 years to wrap which * is effectively never, so we can ignore the possibility. */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (!anon) { ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } } new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); ns_free_inum(&new_ns->ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { lock_mount_hash(); lock_mnt_tree(new); unlock_mount_hash(); } new_ns->root = new; /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { mnt_add_to_ns(new_ns, q); new_ns->nr_mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } if (&p->mnt == new_fs->pwd.mnt) { new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; // an mntns binding we'd skipped? while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } namespace_unlock(); if (rootmnt) mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); mnt_ns_tree_add(new_ns); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } ns->root = mnt; ns->nr_mounts++; mnt_add_to_ns(ns, mnt); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } #define FSMOUNT_VALID_FLAGS \ (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ MOUNT_ATTR_NOSYMFOLLOW) #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) #define MOUNT_SETATTR_PROPAGATION_FLAGS \ (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) { unsigned int mnt_flags = 0; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; return mnt_flags; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct mnt_namespace *ns; struct fs_context *fc; struct file *file; struct path newmount; struct mount *mnt; unsigned int mnt_flags = 0; long ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } CLASS(fd, f)(fs_fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) return ret; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) goto err_unlock; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { pr_warn("VFS: Mount too revealing\n"); goto err_unlock; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; if (fc->sb_flags & SB_MANDLOCK) warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { ret = PTR_ERR(newmount.mnt); goto err_unlock; } newmount.dentry = dget(fc->root); newmount.mnt->mnt_flags = mnt_flags; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) { ret = PTR_ERR(ns); goto err_path; } mnt = real_mount(newmount.mnt); ns->root = mnt; ns->nr_mounts = 1; mnt_add_to_ns(ns, mnt); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount * it, not just simply put it. */ file = dentry_open(&newmount, O_PATH, fc->cred); if (IS_ERR(file)) { dissolve_on_fput(newmount.mnt); ret = PTR_ERR(file); goto err_path; } file->f_mode |= FMODE_NEED_UNMOUNT; ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); if (ret >= 0) fd_install(ret, file); else fput(file); err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); return ret; } static inline int vfs_move_mount(struct path *from_path, struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; ret = security_move_mount(from_path, to_path); if (ret) return ret; if (mflags & MNT_TREE_PROPAGATION) return do_set_group(from_path, to_path); return do_move_mount(from_path, to_path, mflags); } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path to_path __free(path_put) = {}; struct path from_path __free(path_put) = {}; struct filename *to_name __free(putname) = NULL; struct filename *from_name __free(putname) = NULL; unsigned int lflags, uflags; enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) == (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; uflags = 0; if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; from_name = getname_maybe_null(from_pathname, uflags); if (IS_ERR(from_name)) return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; uflags = 0; if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; to_name = getname_maybe_null(to_pathname, uflags); if (IS_ERR(to_name)) return PTR_ERR(to_name); if (!to_name && to_dfd >= 0) { CLASS(fd_raw, f_to)(to_dfd); if (fd_empty(f_to)) return -EBADF; to_path = fd_file(f_to)->f_path; path_get(&to_path); } else { ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); if (ret) return ret; } if (!from_name && from_dfd >= 0) { CLASS(fd_raw, f_from)(from_dfd); if (fd_empty(f_from)) return -EBADF; return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); } ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); if (ret) return ret; return vfs_move_mount(&from_path, &to_path, mflags); } /* * Return true if path is reachable from root * * namespace_sem or mount_lock is held */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; } return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); read_sequnlock_excl(&mount_lock); return res; } EXPORT_SYMBOL(path_is_under); /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) return -EPERM; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) goto out0; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) goto out1; error = security_sb_pivotroot(&old, &new); if (error) goto out2; get_fs_root(current->fs, &root); old_mp = lock_mount(&old); error = PTR_ERR(old_mp); if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; error = -EBUSY; if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (!path_mounted(&root)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) goto out4; lock_mount_hash(); umount_mnt(new_mnt); root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mp, false); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: unlock_mount(old_mp); if (!error) mntput_no_expire(ex_parent); out3: path_put(&root); out2: path_put(&old); out1: path_put(&new); out0: return error; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; /* flags to clear */ flags &= ~kattr->attr_clr; /* flags to raise */ flags |= kattr->attr_set; return flags; } static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ if (kattr->mnt_userns == m->mnt_sb->s_user_ns) return -EINVAL; /* * We only allow an mount to change it's idmapping if it has * never been accessible to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; /* The filesystem has turned off idmapped mounts. */ if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) return -EINVAL; /* We're not controlling the superblock. */ if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ if (!is_anon_ns(mnt->mnt_ns)) return -EINVAL; return 0; } /** * mnt_allow_writers() - check whether the attribute change allows writers * @kattr: the new mount attributes * @mnt: the mount to which @kattr will be applied * * Check whether thew new mount attributes in @kattr allow concurrent writers. * * Return: true if writers need to be held, false if not */ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, const struct mount *mnt) { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; int err; for (m = mnt; m; m = next_mnt(m, mnt)) { if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { err = -EPERM; break; } err = can_idmap_mount(kattr, m); if (err) break; if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); if (err) break; } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } if (err) { struct mount *p; /* * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all * mounts and needs to take care to include the first mount. */ for (p = mnt; p; p = next_mnt(p, mnt)) { /* If we had to hold writers unblock them. */ if (p->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(p); /* * We're done once the first mount we changed got * MNT_WRITE_HOLD unset. */ if (p == m) break; } } return err; } static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { struct mnt_idmap *old_idmap; if (!kattr->mnt_idmap) return; old_idmap = mnt_idmap(&mnt->mnt); /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) { struct mount *m; for (m = mnt; m; m = next_mnt(m, mnt)) { unsigned int flags; do_idmap_mount(kattr, m); flags = recalc_flags(kattr, m); WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ if (m->mnt.mnt_flags & MNT_WRITE_HOLD) mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); } static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; if (!path_mounted(path)) return -EINVAL; if (kattr->mnt_userns) { struct mnt_idmap *mnt_idmap; mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); if (IS_ERR(mnt_idmap)) return PTR_ERR(mnt_idmap); kattr->mnt_idmap = mnt_idmap; } if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing * propagation. */ namespace_lock(); if (kattr->propagation == MS_SHARED) { err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; } } } err = -EINVAL; lock_mount_hash(); /* Ensure that this isn't anything purely vfs internal. */ if (!is_mounted(&mnt->mnt)) goto out; /* * If this is an attached mount make sure it's located in the callers * mount namespace. If it's not don't let the caller interact with it. * * If this mount doesn't have a parent it's most often simply a * detached mount with an anonymous mount namespace. IOW, something * that's simply not attached yet. But there are apparently also users * that do change mount properties on the rootfs itself. That obviously * neither has a parent nor is it a detached mount so we cannot * unconditionally check for detached mounts. */ if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) goto out; /* * First, we get the mount tree in a shape where we can change mount * properties without failure. If we succeeded to do so we commit all * changes and if we failed we clean up. */ err = mount_setattr_prepare(kattr, mnt); if (!err) mount_setattr_commit(kattr, mnt); out: unlock_mount_hash(); if (kattr->propagation) { if (err) cleanup_group_ids(mnt, NULL); namespace_unlock(); } return err; } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; if (attr->attr_clr & MOUNT_ATTR_IDMAP) { /* * We can only remove an idmapping if it's never been * exposed to userspace. */ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) return -EINVAL; /* * Removal of idmappings is equivalent to setting * nop_mnt_idmap. */ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { kattr->mnt_idmap = &nop_mnt_idmap; return 0; } } if (attr->userns_fd > INT_MAX) return -EINVAL; CLASS(fd, f)(attr->userns_fd); if (fd_empty(f)) return -EBADF; if (!proc_ns_file(fd_file(f))) return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops->type != CLONE_NEWUSER) return -EINVAL; /* * The initial idmapping cannot be used to create an idmapped * mount. We use the initial idmapping as an indicator of a mount * that is not idmapped. It can simply be passed into helpers that * are aware of idmapped mounts as a convenient shortcut. A user * can just create a dedicated identity mapping to achieve the same * result. */ mnt_userns = container_of(ns, struct user_namespace, ns); if (mnt_userns == &init_user_ns) return -EPERM; /* We're not controlling the target namespace. */ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) return -EPERM; kattr->mnt_userns = get_user_ns(mnt_userns); return 0; } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, struct mount_kattr *kattr) { if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) return -EINVAL; kattr->propagation = attr->propagation; if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) return -EINVAL; kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); /* * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, * users wanting to transition to a different atime setting cannot * simply specify the atime setting in @attr_set, but must also * specify MOUNT_ATTR__ATIME in the @attr_clr field. * So ensure that MOUNT_ATTR__ATIME can't be partially set in * @attr_clr and that @attr_set can't have any atime bits set if * MOUNT_ATTR__ATIME isn't set in @attr_clr. */ if (attr->attr_clr & MOUNT_ATTR__ATIME) { if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) return -EINVAL; /* * Clear all previous time settings as they are mutually * exclusive. */ kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; switch (attr->attr_set & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_RELATIME: kattr->attr_set |= MNT_RELATIME; break; case MOUNT_ATTR_NOATIME: kattr->attr_set |= MNT_NOATIME; break; case MOUNT_ATTR_STRICTATIME: break; default: return -EINVAL; } } else { if (attr->attr_set & MOUNT_ATTR__ATIME) return -EINVAL; } return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { if (kattr->mnt_userns) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, struct mount_kattr *kattr) { int ret; struct mount_attr attr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) return -EINVAL; if (!may_mount()) return -EPERM; ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); if (ret) return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) return 0; /* Tell caller to not bother. */ ret = build_mount_kattr(&attr, usize, kattr); if (ret < 0) return ret; return 1; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, unsigned int, flags, struct mount_attr __user *, uattr, size_t, usize) { int err; struct path target; struct mount_kattr kattr; unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; kattr = (struct mount_kattr) { .lookup_flags = lookup_flags, }; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; err = wants_mount_setattr(uattr, usize, &kattr); if (err <= 0) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); if (!err) { err = do_mount_setattr(&target, &kattr); path_put(&target); } finish_mount_kattr(&kattr); return err; } SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, unsigned, flags, struct mount_attr __user *, uattr, size_t, usize) { struct file __free(fput) *file = NULL; int fd; if (!uattr && usize) return -EINVAL; file = vfs_open_tree(dfd, filename, flags); if (IS_ERR(file)) return PTR_ERR(file); if (uattr) { int ret; struct mount_kattr kattr = {}; kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); if (ret < 0) return ret; if (ret) { ret = do_mount_setattr(&file->f_path, &kattr); if (ret) return ret; finish_mount_kattr(&kattr); } } fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; fd_install(fd, no_free_ptr(file)); return fd; } int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) return root->d_sb->s_op->show_path(m, root); seq_dentry(m, root, " \t\n\\"); return 0; } static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) { struct mount *mnt = mnt_find_id_at(ns, id); if (!mnt || mnt->mnt_id_unique != id) return NULL; return &mnt->mnt; } struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; struct mnt_idmap *idmap; u64 mask; struct path root; struct seq_file seq; /* Must be last --ends in a flexible-array member. */ struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) { unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); u64 attr_flags = 0; if (mnt_flags & MNT_READONLY) attr_flags |= MOUNT_ATTR_RDONLY; if (mnt_flags & MNT_NOSUID) attr_flags |= MOUNT_ATTR_NOSUID; if (mnt_flags & MNT_NODEV) attr_flags |= MOUNT_ATTR_NODEV; if (mnt_flags & MNT_NOEXEC) attr_flags |= MOUNT_ATTR_NOEXEC; if (mnt_flags & MNT_NODIRATIME) attr_flags |= MOUNT_ATTR_NODIRATIME; if (mnt_flags & MNT_NOSYMFOLLOW) attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; if (mnt_flags & MNT_NOATIME) attr_flags |= MOUNT_ATTR_NOATIME; else if (mnt_flags & MNT_RELATIME) attr_flags |= MOUNT_ATTR_RELATIME; else attr_flags |= MOUNT_ATTR_STRICTATIME; if (is_idmapped_mnt(mnt)) attr_flags |= MOUNT_ATTR_IDMAP; return attr_flags; } static u64 mnt_to_propagation_flags(struct mount *m) { u64 propagation = 0; if (IS_MNT_SHARED(m)) propagation |= MS_SHARED; if (IS_MNT_SLAVE(m)) propagation |= MS_SLAVE; if (IS_MNT_UNBINDABLE(m)) propagation |= MS_UNBINDABLE; if (!propagation) propagation |= MS_PRIVATE; return propagation; } static void statmount_sb_basic(struct kstatmount *s) { struct super_block *sb = s->mnt->mnt_sb; s->sm.mask |= STATMOUNT_SB_BASIC; s->sm.sb_dev_major = MAJOR(sb->s_dev); s->sm.sb_dev_minor = MINOR(sb->s_dev); s->sm.sb_magic = sb->s_magic; s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); } static void statmount_mnt_basic(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_MNT_BASIC; s->sm.mnt_id = m->mnt_id_unique; s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; s->sm.mnt_id_old = m->mnt_id; s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; } static void statmount_propagate_from(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); s->sm.mask |= STATMOUNT_PROPAGATE_FROM; if (IS_MNT_SLAVE(m)) s->sm.propagate_from = get_dominating_id(m, &current->fs->root); } static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) { int ret; size_t start = seq->count; ret = show_path(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* * Unescape the result. It would be better if supplied string was not * escaped in the first place, but that's a pretty invasive change. */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); return 0; } static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; err = seq_path_root(seq, &mnt_path, &s->root, ""); return err == SEQ_SKIP ? 0 : err; } static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; seq_puts(seq, sb->s_type->name); return 0; } static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; if (sb->s_subtype) seq_puts(seq, sb->s_subtype); } static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) { struct super_block *sb = s->mnt->mnt_sb; struct mount *r = real_mount(s->mnt); if (sb->s_op->show_devname) { size_t start = seq->count; int ret; ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); if (ret) return ret; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; /* Unescape the result */ seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); } else { seq_puts(seq, r->mnt_devname); } return 0; } static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; s->sm.mnt_ns_id = ns->seq; } static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; } if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (seq->count == start) return 0; /* skip leading comma */ memmove(seq->buf + start, seq->buf + start + 1, seq->count - start - 1); seq->count--; return 0; } static inline int statmount_opt_process(struct seq_file *seq, size_t start) { char *buf_end, *opt_end, *src, *dst; int count = 0; if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; buf_end = seq->buf + seq->count; dst = seq->buf + start; src = dst + 1; /* skip initial comma */ if (src >= buf_end) { seq->count = start; return 0; } *buf_end = '\0'; for (; src < buf_end; src = opt_end + 1) { opt_end = strchrnul(src, ','); *opt_end = '\0'; dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; if (WARN_ON_ONCE(++count == INT_MAX)) return -EOVERFLOW; } seq->count = dst - 1 - seq->buf; return count; } static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; if (!sb->s_op->show_options) return 0; err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_num = err; return 0; } static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; int err; err = security_sb_show_options(seq, sb); if (err) return err; err = statmount_opt_process(seq, start); if (err < 0) return err; s->sm.opt_sec_num = err; return 0; } static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, true); if (ret < 0) return ret; s->sm.mnt_uidmap_num = ret; /* * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_UIDMAP; return 0; } static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) { int ret; ret = statmount_mnt_idmap(s->idmap, seq, false); if (ret < 0) return ret; s->sm.mnt_gidmap_num = ret; /* * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid * mappings. This allows userspace to distinguish between a * non-idmapped mount and an idmapped mount where none of the * individual mappings are valid in the caller's idmapping. */ if (is_valid_mnt_idmap(s->idmap)) s->sm.mask |= STATMOUNT_MNT_GIDMAP; return 0; } static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; u32 start, *offp; /* Reserve an empty string at the beginning for any unset offsets */ if (!seq->count) seq_putc(seq, 0); start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; case STATMOUNT_MNT_UIDMAP: sm->mnt_uidmap = start; ret = statmount_mnt_uidmap(s, seq); break; case STATMOUNT_MNT_GIDMAP: sm->mnt_gidmap = start; ret = statmount_mnt_gidmap(s, seq); break; default: WARN_ON_ONCE(true); return -EINVAL; } /* * If nothing was emitted, return to avoid setting the flag * and terminating the buffer. */ if (seq->count == start) return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) return -EOVERFLOW; /* signal a retry */ if (unlikely(seq_has_overflowed(seq))) return -EAGAIN; if (ret) return ret; seq->buf[seq->count++] = '\0'; sm->mask |= flag; *offp = start; return 0; } static int copy_statmount_to_user(struct kstatmount *s) { struct statmount *sm = &s->sm; struct seq_file *seq = &s->seq; char __user *str = ((char __user *)s->buf) + sizeof(*sm); size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); if (seq->count && copy_to_user(str, seq->buf, seq->count)) return -EFAULT; /* Return the number of bytes copied to the buffer */ sm->size = copysize + seq->count; if (copy_to_user(s->buf, sm, copysize)) return -EFAULT; return 0; } static struct mount *listmnt_next(struct mount *curr, bool reverse) { struct rb_node *node; if (reverse) node = rb_prev(&curr->mnt_node); else node = rb_next(&curr->mnt_node); return node_to_mount(node); } static int grab_requested_root(struct mnt_namespace *ns, struct path *root) { struct mount *first, *child; rwsem_assert_held(&namespace_sem); /* We're looking at our own ns, just use get_fs_root. */ if (ns == current->nsproxy->mnt_ns) { get_fs_root(current->fs, root); return 0; } /* * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; for (;;) { child = listmnt_next(child, false); if (!child) return -ENOENT; if (child->mnt_parent == first) break; } root->mnt = mntget(&child->mnt); root->dentry = dget(root->mnt->mnt_root); return 0; } /* This must be updated whenever a new flag is added */ #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ STATMOUNT_MNT_BASIC | \ STATMOUNT_PROPAGATE_FROM | \ STATMOUNT_MNT_ROOT | \ STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | \ STATMOUNT_MNT_NS_ID | \ STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | \ STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | \ STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_SUPPORTED_MASK | \ STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { struct path root __free(path_put) = {}; struct mount *m; int err; /* Has the namespace already been emptied? */ if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); if (!s->mnt) return -ENOENT; err = grab_requested_root(ns, &root); if (err) return err; /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ m = real_mount(s->mnt); if (!is_path_reachable(m, m->mnt.mnt_root, &root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; err = security_sb_statfs(s->mnt->mnt_root); if (err) return err; s->root = root; /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the * namespace semaphore and mount properties may change with only * the mount lock held. * * We could sample the mount lock sequence counter to detect * those changes and retry. But it's not worth it. Worst that * happens is that the mnt->mnt_idmap pointer is already changed * while mnt->mnt_flags isn't or vica versa. So what. * * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved * via READ_ONCE()/WRITE_ONCE() and guard against theoretical * torn read/write. That's all we care about right now. */ s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); if (s->mask & STATMOUNT_SB_BASIC) statmount_sb_basic(s); if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); if (s->mask & STATMOUNT_FS_TYPE) err = statmount_string(s, STATMOUNT_FS_TYPE); if (!err && s->mask & STATMOUNT_MNT_ROOT) err = statmount_string(s, STATMOUNT_MNT_ROOT); if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); if (!err && s->mask & STATMOUNT_OPT_ARRAY) err = statmount_string(s, STATMOUNT_OPT_ARRAY); if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); if (!err && s->mask & STATMOUNT_MNT_UIDMAP) err = statmount_string(s, STATMOUNT_MNT_UIDMAP); if (!err && s->mask & STATMOUNT_MNT_GIDMAP) err = statmount_string(s, STATMOUNT_MNT_GIDMAP); if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { s->sm.mask |= STATMOUNT_SUPPORTED_MASK; s->sm.supported_mask = STATMOUNT_SUPPORTED; } if (err) return err; /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); return 0; } static inline bool retry_statmount(const long ret, size_t *seq_size) { if (likely(ret != -EAGAIN)) return false; if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) return false; if (unlikely(*seq_size > MAX_RW_COUNT)) return false; return true; } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, size_t seq_size) { if (!access_ok(buf, bufsize)) return -EFAULT; memset(ks, 0, sizeof(*ks)); ks->mask = kreq->param; ks->buf = buf; ks->bufsize = bufsize; if (ks->mask & STATMOUNT_STRING_REQ) { if (bufsize == sizeof(ks->sm)) return -EOVERFLOW; ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); if (!ks->seq.buf) return -ENOMEM; ks->seq.size = seq_size; } return 0; } static int copy_mnt_id_req(const struct mnt_id_req __user *req, struct mnt_id_req *kreq) { int ret; size_t usize; BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) return -EFAULT; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) return -EINVAL; memset(kreq, 0, sizeof(*kreq)); ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; if (kreq->spare != 0) return -EINVAL; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; return 0; } /* * If the user requested a specific mount namespace id, look that up and return * that, or if not simply grab a passive reference on our mount namespace and * return that. */ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) { struct mnt_namespace *mnt_ns; if (kreq->mnt_ns_id && kreq->spare) return ERR_PTR(-EINVAL); if (kreq->mnt_ns_id) return lookup_mnt_ns(kreq->mnt_ns_id); if (kreq->spare) { struct ns_common *ns; CLASS(fd, f)(kreq->spare); if (fd_empty(f)) return ERR_PTR(-EBADF); if (!proc_ns_file(fd_file(f))) return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); if (ns->ops->type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); } else { mnt_ns = current->nsproxy->mnt_ns; } refcount_inc(&mnt_ns->passive); return mnt_ns; } SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. */ size_t seq_size = 3 * PATH_MAX; int ret; if (flags) return -EINVAL; ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) return -ENOMEM; retry: ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; scoped_guard(rwsem_read, &namespace_sem) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, bool reverse) { struct path root __free(path_put) = {}; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); ret = grab_requested_root(ns, &root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { orig = root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) return -ENOENT; orig.dentry = orig.mnt->mnt_root; } /* * Don't trigger audit denials. We just want to determine what * mounts to show users. */ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); if (ret) return ret; if (!last_mnt_id) { if (reverse) first = node_to_mount(ns->mnt_last_node); else first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); else first = mnt_find_id_at(ns, last_mnt_id + 1); } for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { if (r->mnt_id_unique == mnt_parent_id) continue; if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) continue; *mnt_ids = r->mnt_id_unique; mnt_ids++; nr_mnt_ids--; ret++; } return ret; } SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { u64 *kmnt_ids __free(kvfree) = NULL; const size_t maxcount = 1000000; struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) return -EINVAL; /* * If the mount namespace really has more than 1 million mounts the * caller must iterate over the mount namespace (and reconsider their * system design...). */ if (unlikely(nr_mnt_ids > maxcount)) return -EOVERFLOW; if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; ret = copy_mnt_id_req(req, &kreq); if (ret) return ret; last_mnt_id = kreq.param; /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) return -EINVAL; kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), GFP_KERNEL_ACCOUNT); if (!kmnt_ids) return -ENOMEM; ns = grab_requested_mnt_ns(&kreq); if (!ns) return -ENOENT; if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ scoped_guard(rwsem_read, &namespace_sem) ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; } static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; mnt_add_to_ns(ns, m); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); root.mnt = mnt; root.dentry = mnt->mnt_root; mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); mnt_ns_tree_add(ns); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; drop_collected_mounts(&ns->root->mnt); free_mnt_ns(ns); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR(mnt)) { mnt_make_shortterm(mnt); synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) mnt_make_shortterm(mnt[i]); synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path ns_root; struct path fs_root; bool chrooted; /* Find the namespace root */ ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; ns_root.dentry = ns_root.mnt->mnt_root; path_get(&ns_root); while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) ; get_fs_root(current->fs, &fs_root); chrooted = !path_equal(&fs_root, &ns_root); path_put(&fs_root); path_put(&ns_root); return chrooted; } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt, *n; bool visible = false; down_read(&namespace_sem); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); visible = true; goto found; next: ; } found: up_read(&namespace_sem); return visible; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. */ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, }; #ifdef CONFIG_SYSCTL static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, }; static int __init init_fs_namespace_sysctls(void) { register_sysctl_init("fs", fs_namespace_sysctls); return 0; } fs_initcall(init_fs_namespace_sysctls); #endif /* CONFIG_SYSCTL */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 // SPDX-License-Identifier: GPL-2.0 /* XDP sockets * * AF_XDP sockets allows a channel between XDP programs and userspace * applications. * Copyright(c) 2018 Intel Corporation. * * Author(s): Björn Töpel <bjorn.topel@intel.com> * Magnus Karlsson <magnus.karlsson@intel.com> */ #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__ #include <linux/if_xdp.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/socket.h> #include <linux/file.h> #include <linux/uaccess.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include "xsk_queue.h" #include "xdp_umem.h" #include "xsk.h" #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) return; pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup |= XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_set_rx_need_wakeup); void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup |= XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_set_tx_need_wakeup); void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) return; pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP; pool->cached_need_wakeup &= ~XDP_WAKEUP_RX; } EXPORT_SYMBOL(xsk_clear_rx_need_wakeup); void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { struct xdp_sock *xs; if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX)) return; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); pool->cached_need_wakeup &= ~XDP_WAKEUP_TX; } EXPORT_SYMBOL(xsk_clear_tx_need_wakeup); bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return pool->uses_need_wakeup; } EXPORT_SYMBOL(xsk_uses_need_wakeup); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->real_num_rx_queues) return dev->_rx[queue_id].pool; if (queue_id < dev->real_num_tx_queues) return dev->_tx[queue_id].pool; return NULL; } EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } /* The buffer pool is stored both in the _rx struct and the _tx struct as we do * not know if the device has more tx queues than rx, or the opposite. * This might also change during run time. */ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, u16 queue_id) { if (queue_id >= max_t(unsigned int, dev->real_num_rx_queues, dev->real_num_tx_queues)) return -EINVAL; if (queue_id < dev->real_num_rx_queues) dev->_rx[queue_id].pool = pool; if (queue_id < dev->real_num_tx_queues) dev->_tx[queue_id].pool = pool; return 0; } static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, u32 flags) { u64 addr; int err; addr = xp_get_handle(xskb, xskb->pool); err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; } xp_release(xskb); return 0; } static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u32 frags = xdp_buff_has_frags(xdp); struct xdp_buff_xsk *pos, *tmp; struct list_head *xskb_list; u32 contd = 0; int err; if (frags) contd = XDP_PKT_CONTD; err = __xsk_rcv_zc(xs, xskb, len, contd); if (err) goto err; if (likely(!frags)) return 0; xskb_list = &xskb->pool->xskb_list; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { if (list_is_singular(xskb_list)) contd = 0; len = pos->xdp.data_end - pos->xdp.data; err = __xsk_rcv_zc(xs, pos, len, contd); if (err) goto err; list_del(&pos->list_node); } return 0; err: xsk_buff_free(xdp); return err; } static void *xsk_copy_xdp_start(struct xdp_buff *from) { if (unlikely(xdp_data_meta_unsupported(from))) return from->data; else return from->data_meta; } static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, u32 *from_len, skb_frag_t **frag, u32 rem) { u32 copied = 0; while (1) { u32 copy_len = min_t(u32, *from_len, to_len); memcpy(to, *from, copy_len); copied += copy_len; if (rem == copied) return copied; if (*from_len == copy_len) { *from = skb_frag_address(*frag); *from_len = skb_frag_size((*frag)++); } else { *from += copy_len; *from_len -= copy_len; } if (to_len == copy_len) return copied; to_len -= copy_len; to += copy_len; } } static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; u32 from_len, meta_len, rem, num_desc; struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; skb_frag_t *frag; from_len = xdp->data_end - copy_from; meta_len = xdp->data - copy_from; rem = len + meta_len; if (len <= frame_size && !xdp_buff_has_frags(xdp)) { int err; xsk_xdp = xsk_buff_alloc(xs->pool); if (!xsk_xdp) { xs->rx_dropped++; return -ENOMEM; } memcpy(xsk_xdp->data - meta_len, copy_from, rem); xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); err = __xsk_rcv_zc(xs, xskb, len, 0); if (err) { xsk_buff_free(xsk_xdp); return err; } return 0; } num_desc = (len - 1) / frame_size + 1; if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { xs->rx_queue_full++; return -ENOBUFS; } if (xdp_buff_has_frags(xdp)) { struct skb_shared_info *sinfo; sinfo = xdp_get_shared_info_from_buff(xdp); frag = &sinfo->frags[0]; } do { u32 to_len = frame_size + meta_len; u32 copied; xsk_xdp = xsk_buff_alloc(xs->pool); copy_to = xsk_xdp->data - meta_len; copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem); rem -= copied; xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0); meta_len = 0; } while (rem); return 0; } static bool xsk_tx_writeable(struct xdp_sock *xs) { if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2) return false; return true; } static bool xsk_is_bound(struct xdp_sock *xs) { if (READ_ONCE(xs->state) == XSK_BOUND) { /* Matches smp_wmb() in bind(). */ smp_rmb(); return true; } return false; } static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { xs->rx_dropped++; return -ENOSPC; } return 0; } static void xsk_flush(struct xdp_sock *xs) { xskq_prod_submit(xs->rx); __xskq_cons_release(xs->pool->fq); sock_def_readable(&xs->sk); } int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (!err) { spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); spin_unlock_bh(&xs->pool->rx_lock); } return err; } static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { u32 len = xdp_get_buff_len(xdp); int err; err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; return xsk_rcv_zc(xs, xdp, len); } err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; } int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { int err; err = xsk_rcv(xs, xdp); if (err) return err; if (!xs->flush_node.prev) { struct list_head *flush_list = bpf_net_ctx_get_xskmap_flush_list(); list_add(&xs->flush_node, flush_list); } return 0; } void __xsk_map_flush(struct list_head *flush_list) { struct xdp_sock *xs, *tmp; list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); __list_del_clearprev(&xs->flush_node); } } void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { xskq_prod_submit_n(pool->cq, nb_entries); } EXPORT_SYMBOL(xsk_tx_completed); void xsk_tx_release(struct xsk_buff_pool *pool) { struct xdp_sock *xs; rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { __xskq_cons_release(xs->tx); if (xsk_tx_writeable(xs)) xs->sk.sk_write_space(&xs->sk); } rcu_read_unlock(); } EXPORT_SYMBOL(xsk_tx_release); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { bool budget_exhausted = false; struct xdp_sock *xs; rcu_read_lock(); again: list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { budget_exhausted = true; continue; } if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { if (xskq_has_descs(xs->tx)) xskq_cons_release(xs->tx); continue; } xs->tx_budget_spent++; /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ if (xskq_prod_reserve_addr(pool->cq, desc->addr)) goto out; xskq_cons_release(xs->tx); rcu_read_unlock(); return true; } if (budget_exhausted) { list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) xs->tx_budget_spent = 0; budget_exhausted = false; goto again; } out: rcu_read_unlock(); return false; } EXPORT_SYMBOL(xsk_tx_peek_desc); static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries) { struct xdp_desc *descs = pool->tx_descs; u32 nb_pkts = 0; while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) nb_pkts++; xsk_tx_release(pool); return nb_pkts; } u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); if (!xs) { nb_pkts = 0; goto out; } nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but * if there are fewer slots available, just process that many * packets. This avoids having to implement any buffering in * the Tx path. */ nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); if (!nb_pkts) { xs->tx->queue_empty_descs++; goto out; } __xskq_cons_release(xs->tx); xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: rcu_read_unlock(); return nb_pkts; } EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) { unsigned long flags; int ret; spin_lock_irqsave(&pool->cq_lock, flags); ret = xskq_prod_reserve_addr(pool->cq, addr); spin_unlock_irqrestore(&pool->cq_lock, flags); return ret; } static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; spin_lock_irqsave(&pool->cq_lock, flags); xskq_prod_submit_n(pool->cq, n); spin_unlock_irqrestore(&pool->cq_lock, flags); } static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) { unsigned long flags; spin_lock_irqsave(&pool->cq_lock, flags); xskq_prod_cancel_n(pool->cq, n); spin_unlock_irqrestore(&pool->cq_lock, flags); } static u32 xsk_get_num_desc(struct sk_buff *skb) { return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; } static void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; if (compl->tx_timestamp) { /* sw completion timestamp, not a real one */ *compl->tx_timestamp = ktime_get_tai_fast_ns(); } xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); sock_wfree(skb); } static void xsk_set_destructor_arg(struct sk_buff *skb) { long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; skb_shinfo(skb)->destructor_arg = (void *)num; } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); skb->destructor = sock_wfree; xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; } static void xsk_drop_skb(struct sk_buff *skb) { xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); xsk_consume_skb(skb); } static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); if (unlikely(!skb)) return ERR_PTR(err); skb_reserve(skb, hr); } addr = desc->addr; len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { if (unlikely(i >= MAX_SKB_FRAGS)) return ERR_PTR(-EOVERFLOW); page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); copy = min_t(u32, PAGE_SIZE - offset, len - copied); skb_fill_page_desc(skb, i, page, offset, copy); copied += copy; addr += copy; offset = 0; } skb->len += len; skb->data_len += len; skb->truesize += ts; refcount_add(ts, &xs->sk.sk_wmem_alloc); return skb; } static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); goto free_err; } } else { u32 hr, tr, len; void *buffer; buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; if (!skb) { first_frag = true; hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); if (unlikely(!skb)) goto free_err; skb_reserve(skb, hr); skb_put(skb, len); err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct page *page; u8 *vaddr; if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; } page = alloc_page(xs->sk.sk_allocation); if (unlikely(!page)) { err = -EAGAIN; goto free_err; } vaddr = kmap_local_page(page); memcpy(vaddr, buffer, len); kunmap_local(vaddr); skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); } if (first_frag && desc->options & XDP_TX_METADATA) { if (unlikely(xs->pool->tx_metadata_len == 0)) { err = -EINVAL; goto free_err; } meta = buffer - xs->pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { err = -EINVAL; goto free_err; } if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { if (unlikely(meta->request.csum_start + meta->request.csum_offset + sizeof(__sum16) > len)) { err = -EINVAL; goto free_err; } skb->csum_start = hr + meta->request.csum_start; skb->csum_offset = meta->request.csum_offset; skb->ip_summed = CHECKSUM_PARTIAL; if (unlikely(xs->pool->tx_sw_csum)) { err = skb_checksum_help(skb); if (err) goto free_err; } } if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) skb->skb_mstamp_ns = meta->request.launch_time; } } skb->dev = dev; skb->priority = READ_ONCE(xs->sk.sk_priority); skb->mark = READ_ONCE(xs->sk.sk_mark); skb->destructor = xsk_destruct_skb; xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_set_destructor_arg(skb); return skb; free_err: if (first_frag && skb) kfree_skb(skb); if (err == -EOVERFLOW) { /* Drop the packet */ xsk_set_destructor_arg(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { /* Let application retry */ xsk_cq_cancel_locked(xs->pool, 1); } return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; int err = 0; mutex_lock(&xs->mutex); /* Since we dropped the RCU read lock, the socket state might have changed. */ if (unlikely(!xsk_is_bound(xs))) { err = -ENXIO; goto out; } if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; goto out; } /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); if (err) { err = -EAGAIN; goto out; } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); if (err != -EOVERFLOW) goto out; err = 0; continue; } xskq_cons_release(xs->tx); if (xp_mb_desc(&desc)) { xs->skb = skb; continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); xsk_consume_skb(skb); err = -EAGAIN; goto out; } /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; xs->skb = NULL; goto out; } sent_frame = true; xs->skb = NULL; } if (xskq_has_descs(xs->tx)) { if (xs->skb) xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } out: if (sent_frame) if (xsk_tx_writeable(xs)) sk->sk_write_space(sk); mutex_unlock(&xs->mutex); return err; } static int xsk_generic_xmit(struct sock *sk) { int ret; /* Drop the RCU lock since the SKB path might sleep. */ rcu_read_unlock(); ret = __xsk_generic_xmit(sk); /* Reaquire RCU lock before going into common code. */ rcu_read_lock(); return ret; } static bool xsk_no_wakeup(struct sock *sk) { #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif } static int xsk_check_common(struct xdp_sock *xs) { if (unlikely(!xsk_is_bound(xs))) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; return 0; } static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(need_wait)) return -EOPNOTSUPP; if (unlikely(!xs->tx)) return -ENOBUFS; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xs->zc && xsk_no_wakeup(sk)) return 0; pool = xs->pool; if (pool->cached_need_wakeup & XDP_WAKEUP_TX) { if (xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_TX); return xsk_generic_xmit(sk); } return 0; } static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { int ret; rcu_read_lock(); ret = __xsk_sendmsg(sock, m, total_len); rcu_read_unlock(); return ret; } static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { bool need_wait = !(flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; err = xsk_check_common(xs); if (err) return err; if (unlikely(!xs->rx)) return -ENOBUFS; if (unlikely(need_wait)) return -EOPNOTSUPP; if (sk_can_busy_loop(sk)) sk_busy_loop(sk, 1); /* only support non-blocking sockets */ if (xsk_no_wakeup(sk)) return 0; if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc) return xsk_wakeup(xs, XDP_WAKEUP_RX); return 0; } static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags) { int ret; rcu_read_lock(); ret = __xsk_recvmsg(sock, m, len, flags); rcu_read_unlock(); return ret; } static __poll_t xsk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { __poll_t mask = 0; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct xsk_buff_pool *pool; sock_poll_wait(file, sock, wait); rcu_read_lock(); if (xsk_check_common(xs)) goto out; pool = xs->pool; if (pool->cached_need_wakeup) { if (xs->zc) xsk_wakeup(xs, pool->cached_need_wakeup); else if (xs->tx) /* Poll needs to drive Tx also in copy mode */ xsk_generic_xmit(sk); } if (xs->rx && !xskq_prod_is_empty(xs->rx)) mask |= EPOLLIN | EPOLLRDNORM; if (xs->tx && xsk_tx_writeable(xs)) mask |= EPOLLOUT | EPOLLWRNORM; out: rcu_read_unlock(); return mask; } static int xsk_init_queue(u32 entries, struct xsk_queue **queue, bool umem_queue) { struct xsk_queue *q; if (entries == 0 || *queue || !is_power_of_2(entries)) return -EINVAL; q = xskq_create(entries, umem_queue); if (!q) return -ENOMEM; /* Make sure queue is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(*queue, q); return 0; } static void xsk_unbind_dev(struct xdp_sock *xs) { struct net_device *dev = xs->dev; if (xs->state != XSK_BOUND) return; WRITE_ONCE(xs->state, XSK_UNBOUND); /* Wait for driver to stop using the xdp socket. */ xp_del_xsk(xs->pool, xs); synchronize_net(); dev_put(dev); } static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs, struct xdp_sock __rcu ***map_entry) { struct xsk_map *map = NULL; struct xsk_map_node *node; *map_entry = NULL; spin_lock_bh(&xs->map_list_lock); node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node, node); if (node) { bpf_map_inc(&node->map->map); map = node->map; *map_entry = node->map_entry; } spin_unlock_bh(&xs->map_list_lock); return map; } static void xsk_delete_from_maps(struct xdp_sock *xs) { /* This function removes the current XDP socket from all the * maps it resides in. We need to take extra care here, due to * the two locks involved. Each map has a lock synchronizing * updates to the entries, and each socket has a lock that * synchronizes access to the list of maps (map_list). For * deadlock avoidance the locks need to be taken in the order * "map lock"->"socket map list lock". We start off by * accessing the socket map list, and take a reference to the * map to guarantee existence between the * xsk_get_map_list_entry() and xsk_map_try_sock_delete() * calls. Then we ask the map to remove the socket, which * tries to remove the socket from the map. Note that there * might be updates to the map between * xsk_get_map_list_entry() and xsk_map_try_sock_delete(). */ struct xdp_sock __rcu **map_entry = NULL; struct xsk_map *map; while ((map = xsk_get_map_list_entry(xs, &map_entry))) { xsk_map_try_sock_delete(map, xs, map_entry); bpf_map_put(&map->map); } } static int xsk_release(struct socket *sock) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net *net; if (!sk) return 0; net = sock_net(sk); if (xs->skb) xsk_drop_skb(xs->skb); mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, sk->sk_prot, -1); xsk_delete_from_maps(xs); mutex_lock(&xs->mutex); xsk_unbind_dev(xs); mutex_unlock(&xs->mutex); xskq_destroy(xs->rx); xskq_destroy(xs->tx); xskq_destroy(xs->fq_tmp); xskq_destroy(xs->cq_tmp); sock_orphan(sk); sock->sk = NULL; sock_put(sk); return 0; } static struct socket *xsk_lookup_xsk_from_fd(int fd) { struct socket *sock; int err; sock = sockfd_lookup(fd, &err); if (!sock) return ERR_PTR(-ENOTSOCK); if (sock->sk->sk_family != PF_XDP) { sockfd_put(sock); return ERR_PTR(-ENOPROTOOPT); } return sock; } static bool xsk_validate_queues(struct xdp_sock *xs) { return xs->fq_tmp && xs->cq_tmp; } static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; int bound_dev_if; u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) return -EINVAL; if (sxdp->sxdp_family != AF_XDP) return -EINVAL; flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex) return -EINVAL; rtnl_lock(); mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { err = -EBUSY; goto out_release; } dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex); if (!dev) { err = -ENODEV; goto out_release; } netdev_lock_ops(dev); if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; } qid = sxdp->sxdp_queue_id; if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. */ err = -EINVAL; goto out_unlock; } if (xs->umem) { /* We have already our own. */ err = -EINVAL; goto out_unlock; } sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd); if (IS_ERR(sock)) { err = PTR_ERR(sock); goto out_unlock; } umem_xs = xdp_sk(sock->sk); if (!xsk_is_bound(umem_xs)) { err = -EBADF; sockfd_put(sock); goto out_unlock; } if (umem_xs->queue_id != qid || umem_xs->dev != dev) { /* Share the umem with another socket on another qid * and/or device. */ xs->pool = xp_create_and_assign_umem(xs, umem_xs->umem); if (!xs->pool) { err = -ENOMEM; sockfd_put(sock); goto out_unlock; } err = xp_assign_dev_shared(xs->pool, umem_xs, dev, qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } else { /* Share the buffer pool with the other socket. */ if (xs->fq_tmp || xs->cq_tmp) { /* Do not allow setting your own fq or cq. */ err = -EINVAL; sockfd_put(sock); goto out_unlock; } xp_get_pool(umem_xs->pool); xs->pool = umem_xs->pool; /* If underlying shared umem was created without Tx * ring, allocate Tx descs array that Tx batching API * utilizes */ if (xs->tx && !xs->pool->tx_descs) { err = xp_alloc_tx_descs(xs->pool, xs); if (err) { xp_put_pool(xs->pool); xs->pool = NULL; sockfd_put(sock); goto out_unlock; } } } xdp_get_umem(umem_xs->umem); WRITE_ONCE(xs->umem, umem_xs->umem); sockfd_put(sock); } else if (!xs->umem || !xsk_validate_queues(xs)) { err = -EINVAL; goto out_unlock; } else { /* This xsk has its own umem. */ xs->pool = xp_create_and_assign_umem(xs, xs->umem); if (!xs->pool) { err = -ENOMEM; goto out_unlock; } err = xp_assign_dev(xs->pool, dev, qid, flags); if (err) { xp_destroy(xs->pool); xs->pool = NULL; goto out_unlock; } } /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ xs->fq_tmp = NULL; xs->cq_tmp = NULL; xs->dev = dev; xs->zc = xs->umem->zc; xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); if (rxq->napi) __sk_mark_napi_id_once(sk, rxq->napi->napi_id); } out_unlock: if (err) { dev_put(dev); } else { /* Matches smp_rmb() in bind() for shared umem * sockets, and xsk_is_bound(). */ smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); return err; } struct xdp_umem_reg_v1 { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ __u32 chunk_size; __u32 headroom; }; static int xsk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int err; if (level != SOL_XDP) return -ENOPROTOOPT; switch (optname) { case XDP_RX_RING: case XDP_TX_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx; err = xsk_init_queue(entries, q, false); if (!err && optname == XDP_TX_RING) /* Tx needs to be explicitly woken up the first time */ xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; mutex_unlock(&xs->mutex); return err; } case XDP_UMEM_REG: { size_t mr_size = sizeof(struct xdp_umem_reg); struct xdp_umem_reg mr = {}; struct xdp_umem *umem; if (optlen < sizeof(struct xdp_umem_reg_v1)) return -EINVAL; else if (optlen < sizeof(mr)) mr_size = sizeof(struct xdp_umem_reg_v1); BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg)); /* Make sure the last field of the struct doesn't have * uninitialized padding. All padding has to be explicit * and has to be set to zero by the userspace to make * struct xdp_umem_reg extensible in the future. */ BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) + sizeof_field(struct xdp_umem_reg, tx_metadata_len) != sizeof(struct xdp_umem_reg)); if (copy_from_sockptr(&mr, optval, mr_size)) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY || xs->umem) { mutex_unlock(&xs->mutex); return -EBUSY; } umem = xdp_umem_create(&mr); if (IS_ERR(umem)) { mutex_unlock(&xs->mutex); return PTR_ERR(umem); } /* Make sure umem is ready before it can be seen by others */ smp_wmb(); WRITE_ONCE(xs->umem, umem); mutex_unlock(&xs->mutex); return 0; } case XDP_UMEM_FILL_RING: case XDP_UMEM_COMPLETION_RING: { struct xsk_queue **q; int entries; if (optlen < sizeof(entries)) return -EINVAL; if (copy_from_sockptr(&entries, optval, sizeof(entries))) return -EFAULT; mutex_lock(&xs->mutex); if (xs->state != XSK_READY) { mutex_unlock(&xs->mutex); return -EBUSY; } q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp : &xs->cq_tmp; err = xsk_init_queue(entries, q, true); mutex_unlock(&xs->mutex); return err; } default: break; } return -ENOPROTOOPT; } static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_rxtx_ring, desc); } static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring) { ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer); ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer); ring->desc = offsetof(struct xdp_umem_ring, desc); } struct xdp_statistics_v1 { __u64 rx_dropped; __u64 rx_invalid_descs; __u64 tx_invalid_descs; }; static int xsk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); int len; if (level != SOL_XDP) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case XDP_STATISTICS: { struct xdp_statistics stats = {}; bool extra_stats = true; size_t stats_size; if (len < sizeof(struct xdp_statistics_v1)) { return -EINVAL; } else if (len < sizeof(stats)) { extra_stats = false; stats_size = sizeof(struct xdp_statistics_v1); } else { stats_size = sizeof(stats); } mutex_lock(&xs->mutex); stats.rx_dropped = xs->rx_dropped; if (extra_stats) { stats.rx_ring_full = xs->rx_queue_full; stats.rx_fill_ring_empty_descs = xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0; stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx); } else { stats.rx_dropped += xs->rx_queue_full; } stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx); stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx); mutex_unlock(&xs->mutex); if (copy_to_user(optval, &stats, stats_size)) return -EFAULT; if (put_user(stats_size, optlen)) return -EFAULT; return 0; } case XDP_MMAP_OFFSETS: { struct xdp_mmap_offsets off; struct xdp_mmap_offsets_v1 off_v1; bool flags_supported = true; void *to_copy; if (len < sizeof(off_v1)) return -EINVAL; else if (len < sizeof(off)) flags_supported = false; if (flags_supported) { /* xdp_ring_offset is identical to xdp_ring_offset_v1 * except for the flags field added to the end. */ xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.rx); xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *) &off.tx); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.fr); xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *) &off.cr); off.rx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.tx.flags = offsetof(struct xdp_rxtx_ring, ptrs.flags); off.fr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); off.cr.flags = offsetof(struct xdp_umem_ring, ptrs.flags); len = sizeof(off); to_copy = &off; } else { xsk_enter_rxtx_offsets(&off_v1.rx); xsk_enter_rxtx_offsets(&off_v1.tx); xsk_enter_umem_offsets(&off_v1.fr); xsk_enter_umem_offsets(&off_v1.cr); len = sizeof(off_v1); to_copy = &off_v1; } if (copy_to_user(optval, to_copy, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } case XDP_OPTIONS: { struct xdp_options opts = {}; if (len < sizeof(opts)) return -EINVAL; mutex_lock(&xs->mutex); if (xs->zc) opts.flags |= XDP_OPTIONS_ZEROCOPY; mutex_unlock(&xs->mutex); len = sizeof(opts); if (copy_to_user(optval, &opts, len)) return -EFAULT; if (put_user(len, optlen)) return -EFAULT; return 0; } default: break; } return -EOPNOTSUPP; } static int xsk_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; unsigned long size = vma->vm_end - vma->vm_start; struct xdp_sock *xs = xdp_sk(sock->sk); int state = READ_ONCE(xs->state); struct xsk_queue *q = NULL; if (state != XSK_READY && state != XSK_BOUND) return -EBUSY; if (offset == XDP_PGOFF_RX_RING) { q = READ_ONCE(xs->rx); } else if (offset == XDP_PGOFF_TX_RING) { q = READ_ONCE(xs->tx); } else { /* Matches the smp_wmb() in XDP_UMEM_REG */ smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) : READ_ONCE(xs->pool->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) : READ_ONCE(xs->pool->cq); } if (!q) return -EINVAL; /* Matches the smp_wmb() in xsk_init_queue */ smp_rmb(); if (size > q->ring_vmalloc_size) return -EINVAL; return remap_vmalloc_range(vma, q->ring, 0); } static int xsk_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct sock *sk; switch (msg) { case NETDEV_UNREGISTER: mutex_lock(&net->xdp.lock); sk_for_each(sk, &net->xdp.list) { struct xdp_sock *xs = xdp_sk(sk); mutex_lock(&xs->mutex); if (xs->dev == dev) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); xsk_unbind_dev(xs); /* Clear device references. */ xp_clear_dev(xs->pool); } mutex_unlock(&xs->mutex); } mutex_unlock(&net->xdp.lock); break; } return NOTIFY_DONE; } static struct proto xsk_proto = { .name = "XDP", .owner = THIS_MODULE, .obj_size = sizeof(struct xdp_sock), }; static const struct proto_ops xsk_proto_ops = { .family = PF_XDP, .owner = THIS_MODULE, .release = xsk_release, .bind = xsk_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = xsk_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = xsk_setsockopt, .getsockopt = xsk_getsockopt, .sendmsg = xsk_sendmsg, .recvmsg = xsk_recvmsg, .mmap = xsk_mmap, }; static void xsk_destruct(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); if (!sock_flag(sk, SOCK_DEAD)) return; if (!xp_put_pool(xs->pool)) xdp_put_umem(xs->umem, !xs->pool); } static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { struct xdp_sock *xs; struct sock *sk; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol) return -EPROTONOSUPPORT; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern); if (!sk) return -ENOBUFS; sock->ops = &xsk_proto_ops; sock_init_data(sock, sk); sk->sk_family = PF_XDP; sk->sk_destruct = xsk_destruct; sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); xs->state = XSK_READY; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); mutex_lock(&net->xdp.lock); sk_add_node_rcu(sk, &net->xdp.list); mutex_unlock(&net->xdp.lock); sock_prot_inuse_add(net, &xsk_proto, 1); return 0; } static const struct net_proto_family xsk_family_ops = { .family = PF_XDP, .create = xsk_create, .owner = THIS_MODULE, }; static struct notifier_block xsk_netdev_notifier = { .notifier_call = xsk_notifier, }; static int __net_init xsk_net_init(struct net *net) { mutex_init(&net->xdp.lock); INIT_HLIST_HEAD(&net->xdp.list); return 0; } static void __net_exit xsk_net_exit(struct net *net) { WARN_ON_ONCE(!hlist_empty(&net->xdp.list)); } static struct pernet_operations xsk_net_ops = { .init = xsk_net_init, .exit = xsk_net_exit, }; static int __init xsk_init(void) { int err; err = proto_register(&xsk_proto, 0 /* no slab */); if (err) goto out; err = sock_register(&xsk_family_ops); if (err) goto out_proto; err = register_pernet_subsys(&xsk_net_ops); if (err) goto out_sk; err = register_netdevice_notifier(&xsk_netdev_notifier); if (err) goto out_pernet; return 0; out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: sock_unregister(PF_XDP); out_proto: proto_unregister(&xsk_proto); out: return err; } fs_initcall(xsk_init);
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2011, Intel Corporation. * * Description: Data Center Bridging netlink interface * Author: Lucy Liu <lucy.liu@intel.com> */ #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/slab.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include <linux/dcbnl.h> #include <net/dcbevent.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <net/sock.h> /* Data Center Bridging (DCB) is a collection of Ethernet enhancements * intended to allow network traffic with differing requirements * (highly reliable, no drops vs. best effort vs. low latency) to operate * and co-exist on Ethernet. Current DCB features are: * * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a * framework for assigning bandwidth guarantees to traffic classes. * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. * * More information about the emerging standards for these Ethernet features * can be found at: http://www.ieee802.org/1/pages/dcbridges.html * * This file implements an rtnetlink interface to allow configuration of DCB * features for capable devices. */ /**************** DCB attribute policies *************************************/ /* DCB netlink attributes policy */ static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = { [DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1}, [DCB_ATTR_STATE] = {.type = NLA_U8}, [DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_PG_CFG] = {.type = NLA_NESTED}, [DCB_ATTR_SET_ALL] = {.type = NLA_U8}, [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG}, [DCB_ATTR_CAP] = {.type = NLA_NESTED}, [DCB_ATTR_PFC_STATE] = {.type = NLA_U8}, [DCB_ATTR_BCN] = {.type = NLA_NESTED}, [DCB_ATTR_APP] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE] = {.type = NLA_NESTED}, [DCB_ATTR_DCBX] = {.type = NLA_U8}, [DCB_ATTR_FEATCFG] = {.type = NLA_NESTED}, }; /* DCB priority flow control to User Priority nested attributes */ static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = { [DCB_PFC_UP_ATTR_0] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_1] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_2] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_3] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_4] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_5] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_6] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_7] = {.type = NLA_U8}, [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB priority grouping nested attributes */ static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = { [DCB_PG_ATTR_TC_0] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_1] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_2] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_3] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_4] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_5] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_6] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_7] = {.type = NLA_NESTED}, [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED}, [DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return -EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_array(app_count, sizeof(struct dcb_app), GFP_KERNEL); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. */ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? */ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. */ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. */ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. */ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_GETDCB, .doit = dcb_doit}, {.msgtype = RTM_SETDCB, .doit = dcb_doit}, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register_many(dcbnl_rtnl_msg_handlers); return 0; } device_initcall(dcbnl_init);
33 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/rculist_nulls.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rcuref.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ union { struct { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; u32 nr_extents; }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif #ifdef CONFIG_FANOTIFY UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; enum rlimit_type { UCOUNT_RLIMIT_NPROC, UCOUNT_RLIMIT_MSGQUEUE, UCOUNT_RLIMIT_SIGPENDING, UCOUNT_RLIMIT_MEMLOCK, UCOUNT_RLIMIT_COUNTS, }; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc; #endif struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; long ucount_max[UCOUNT_COUNTS]; long rlimit_max[UCOUNT_RLIMIT_COUNTS]; #if IS_ENABLED(CONFIG_BINFMT_MISC) struct binfmt_misc *binfmt_misc; #endif } __randomize_layout; struct ucounts { struct hlist_nulls_node node; struct user_namespace *ns; kuid_t uid; struct rcu_head rcu; rcuref_t count; atomic_long_t ucount[UCOUNT_COUNTS]; atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS]; }; extern struct user_namespace init_user_ns; extern struct ucounts init_ucounts; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid); void put_ucounts(struct ucounts *ucounts); static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts) { if (rcuref_get(&ucounts->count)) return ucounts; return NULL; } static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type) { return atomic_long_read(&ucounts->rlimit[type]); } long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type) { return READ_ONCE(ns->rlimit_max[type]); } static inline void set_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type, unsigned long max) { ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; } #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) refcount_inc(&ns->ns.count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ #define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_ /** * __ffs - find first bit in word. * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ static __always_inline unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } #endif
32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu-vm.c - vmalloc area based chunk allocation * * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * * Chunks are mapped into vmalloc areas and populated page by page. * This is the default chunk allocator. */ #include "internal.h" static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { /* must not be used on pre-mapped chunk */ WARN_ON(chunk->immutable); return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /** * pcpu_get_pages - get temp pages array * * Returns pointer to array of pointers to struct page which can be indexed * with pcpu_page_idx(). Note that there is only one array and accesses * should be serialized by pcpu_alloc_mutex. * * RETURNS: * Pointer to temp pages array on success. */ static struct page **pcpu_get_pages(void) { static struct page **pages; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } /** * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * * Free pages [@page_start and @page_end) in @pages for all units. * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page = pages[pcpu_page_idx(cpu, i)]; if (page) __free_page(page); } } } /** * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end, gfp_t gfp) { unsigned int cpu, tcpu; int i; gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); if (!*pagep) goto err; } } return 0; err: while (--i >= page_start) __free_page(pages[pcpu_page_idx(cpu, i)]); for_each_possible_cpu(tcpu) { if (tcpu == cpu) break; for (i = page_start; i < page_end; i++) __free_page(pages[pcpu_page_idx(tcpu, i)]); } return -ENOMEM; } /** * pcpu_pre_unmap_flush - flush cache prior to unmapping * @chunk: chunk the regions to be flushed belongs to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages in [@page_start,@page_end) of @chunk are about to be * unmapped. Flush cache. As each flushing trial can be very * expensive, issue flush on the whole region at once rather than * doing it for each cpu. This could be an overkill but is more * scalable. */ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) { vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); } /** * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. * Corresponding elements in @pages were cleared by the caller and can * be used to carry information to pcpu_free_pages() which will be * called after all unmaps are finished. The caller should call * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page *page; page = pcpu_chunk_page(chunk, cpu, i); WARN_ON(!page); pages[pcpu_page_idx(cpu, i)] = page; } __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } } /** * pcpu_post_unmap_tlb_flush - flush TLB after unmapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush * TLB for the regions. This can be skipped if the area is to be * returned to vmalloc as vmalloc will handle TLB flushing lazily. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, int nr_pages) { return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), PAGE_KERNEL, pages, PAGE_SHIFT); } /** * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * * For each cpu, map pages [@page_start,@page_end) into @chunk. The * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * * This function is responsible for setting up whatever is necessary for * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) goto err; for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); } return 0; err: for_each_possible_cpu(tcpu) { __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), page_end - page_start); if (tcpu == cpu) break; } pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); return err; } /** * pcpu_post_map_flush - flush cache after mapping * @chunk: pcpu_chunk the regions to be flushed belong to * @page_start: page index of the first page to be flushed * @page_end: page index of the last page to be flushed + 1 * * Pages [@page_start,@page_end) of @chunk have been mapped. Flush * cache. * * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once * for the whole region. */ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp) { struct page **pages; pages = pcpu_get_pages(); if (!pages) return -ENOMEM; if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { pcpu_free_pages(chunk, pages, page_start, page_end); return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); return 0; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate * @page_start: the start page * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) * from @chunk. * * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the * region back to vmalloc() which will lazily flush the tlb. * * CONTEXT: * pcpu_alloc_mutex. */ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end) { struct page **pages; /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ pages = pcpu_get_pages(); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); pcpu_unmap_pages(chunk, pages, page_start, page_end); pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; } chunk->data = vms; chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); return chunk; } static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; pcpu_stats_chunk_dealloc(); trace_percpu_destroy_chunk(chunk->base_addr); if (chunk->data) pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); pcpu_free_chunk(chunk); } static struct page *pcpu_addr_to_page(void *addr) { return vmalloc_to_page(addr); } static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) { /* no extra restriction */ return 0; } /** * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim * @chunk: chunk of interest * * This is the entry point for percpu reclaim. If a chunk qualifies, it is then * isolated and managed in separate lists at the back of pcpu_slot: sidelined * and to_depopulate respectively. The to_depopulate list holds chunks slated * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once * they are on this list. Once depopulated, they are moved onto the sidelined * list which enables them to be pulled back in for allocation if no other chunk * can suffice the allocation. */ static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) { /* do not reclaim either the first chunk or reserved chunk */ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) return false; /* * If it is isolated, it may be on the sidelined list so move it back to * the to_depopulate list. If we hit at least 1/4 pages empty pages AND * there is no system-wide shortage of empty pages aside from this * chunk, move it to the to_depopulate list. */ return ((chunk->isolated && chunk->nr_empty_pop_pages) || (pcpu_nr_empty_pop_pages > (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); }
166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* Copy ready events to userspace */ int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(!READ_ONCE(file->f_ep))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_release(struct file *file) {} #endif #if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT) /* ARM OABI has an incompatible struct layout and needs a special handler */ extern struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent); #else static inline struct epoll_event __user * epoll_put_uevent(__poll_t revents, __u64 data, struct epoll_event __user *uevent) { if (__put_user(revents, &uevent->events) || __put_user(data, &uevent->data)) return NULL; return uevent+1; } #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
97 97 97 64 63 53 53 53 8 8 8 27 28 17 17 24 24 24 7 15 2 24 3 19 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 // SPDX-License-Identifier: GPL-2.0-only /* * arch/arm64/mm/hugetlbpage.c * * Copyright (C) 2013 Linaro Ltd. * * Based on arch/x86/mm/hugetlbpage.c. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> /* * HugeTLB Support Matrix * * --------------------------------------------------- * | Page Size | CONT PTE | PMD | CONT PMD | PUD | * --------------------------------------------------- * | 4K | 64K | 2M | 32M | 1G | * | 16K | 2M | 32M | 1G | | * | 64K | 2M | 512M | 16G | | * --------------------------------------------------- */ /* * Reserve CMA areas for the largest supported gigantic * huge page when requested. Any other smaller gigantic * huge pages could still be served from those areas. */ #ifdef CONFIG_CMA void __init arm64_hugetlb_cma_reserve(void) { int order; if (pud_sect_supported()) order = PUD_SHIFT - PAGE_SHIFT; else order = CONT_PMD_SHIFT - PAGE_SHIFT; hugetlb_cma_reserve(order); } #endif /* CONFIG_CMA */ static bool __hugetlb_valid_size(unsigned long size) { switch (size) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: return pud_sect_supported(); #endif case CONT_PMD_SIZE: case PMD_SIZE: case CONT_PTE_SIZE: return true; } return false; } #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h) { size_t pagesize = huge_page_size(h); if (!__hugetlb_valid_size(pagesize)) { pr_warn("%s: unrecognized huge page size 0x%lx\n", __func__, pagesize); return false; } return true; } #endif static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { pgd_t *pgdp = pgd_offset(mm, addr); p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; *pgsize = PAGE_SIZE; p4dp = p4d_offset(pgdp, addr); pudp = pud_offset(p4dp, addr); pmdp = pmd_offset(pudp, addr); if ((pte_t *)pmdp == ptep) { *pgsize = PMD_SIZE; return CONT_PMDS; } return CONT_PTES; } static inline int num_contig_ptes(unsigned long size, size_t *pgsize) { int contig_ptes = 1; *pgsize = size; switch (size) { case CONT_PMD_SIZE: *pgsize = PMD_SIZE; contig_ptes = CONT_PMDS; break; case CONT_PTE_SIZE: *pgsize = PAGE_SIZE; contig_ptes = CONT_PTES; break; default: WARN_ON(!__hugetlb_valid_size(size)); } return contig_ptes; } pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { int ncontig, i; size_t pgsize; pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = find_num_contig(mm, addr, ptep, &pgsize); for (i = 0; i < ncontig; i++, ptep++) { pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); if (pte_young(pte)) orig_pte = pte_mkyoung(orig_pte); } return orig_pte; } /* * Changing some bits of contiguous entries requires us to follow a * Break-Before-Make approach, breaking the whole contiguous set * before we can change any entries. See ARM DDI 0487A.k_iss10775, * "Misprogramming of the Contiguous bit", page D4-1762. * * This helper performs the break step. */ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { pte_t pte, tmp_pte; bool present; pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); present = pte_present(pte); while (--ncontig) { ptep++; tmp_pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); if (present) { if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); if (pte_young(tmp_pte)) pte = pte_mkyoung(pte); } } return pte; } static pte_t get_clear_contig_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long end = addr + (pgsize * ncontig); __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true); return orig_pte; } /* * Changing some bits of contiguous entries requires us to follow a * Break-Before-Make approach, breaking the whole contiguous set * before we can change any entries. See ARM DDI 0487A.k_iss10775, * "Misprogramming of the Contiguous bit", page D4-1762. * * This helper performs the break step for use cases where the * original pte is not needed. */ static void clear_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pgsize, unsigned long ncontig) { struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) __ptep_get_and_clear_anysz(mm, ptep, pgsize); if (mm == &init_mm) flush_tlb_kernel_range(saddr, addr); else __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz) { size_t pgsize; int i; int ncontig; ncontig = num_contig_ptes(sz, &pgsize); if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) __set_ptes_anysz(mm, ptep, pte, 1, pgsize); return; } /* Only need to "break" if transitioning valid -> valid. */ if (pte_cont(pte) && pte_valid(__ptep_get(ptep))) clear_flush(mm, addr, ptep, pgsize, ncontig); __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep = NULL; pgdp = pgd_offset(mm, addr); p4dp = p4d_alloc(mm, pgdp, addr); if (!p4dp) return NULL; pudp = pud_alloc(mm, p4dp, addr); if (!pudp) return NULL; if (sz == PUD_SIZE) { ptep = (pte_t *)pudp; } else if (sz == (CONT_PTE_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); if (!pmdp) return NULL; WARN_ON(addr & (sz - 1)); ptep = pte_alloc_huge(mm, pmdp, addr); } else if (sz == PMD_SIZE) { if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp))) ptep = huge_pmd_share(mm, vma, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); } else if (sz == (CONT_PMD_SIZE)) { pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); return (pte_t *)pmdp; } return ptep; } pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; pud_t *pudp, pud; pmd_t *pmdp, pmd; pgdp = pgd_offset(mm, addr); if (!pgd_present(READ_ONCE(*pgdp))) return NULL; p4dp = p4d_offset(pgdp, addr); if (!p4d_present(READ_ONCE(*p4dp))) return NULL; pudp = pud_offset(p4dp, addr); pud = READ_ONCE(*pudp); if (sz != PUD_SIZE && pud_none(pud)) return NULL; /* hugepage or swap? */ if (pud_leaf(pud) || !pud_present(pud)) return (pte_t *)pudp; /* table; check the next level */ if (sz == CONT_PMD_SIZE) addr &= CONT_PMD_MASK; pmdp = pmd_offset(pudp, addr); pmd = READ_ONCE(*pmdp); if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && pmd_none(pmd)) return NULL; if (pmd_leaf(pmd) || !pmd_present(pmd)) return (pte_t *)pmdp; if (sz == CONT_PTE_SIZE) return pte_offset_huge(pmdp, (addr & CONT_PTE_MASK)); return NULL; } unsigned long hugetlb_mask_last_page(struct hstate *h) { unsigned long hp_size = huge_page_size(h); switch (hp_size) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: if (pud_sect_supported()) return PGDIR_SIZE - PUD_SIZE; break; #endif case CONT_PMD_SIZE: return PUD_SIZE - CONT_PMD_SIZE; case PMD_SIZE: return PUD_SIZE - PMD_SIZE; case CONT_PTE_SIZE: return PMD_SIZE - CONT_PTE_SIZE; default: break; } return 0UL; } pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; switch (pagesize) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: if (pud_sect_supported()) return pud_pte(pud_mkhuge(pte_pud(entry))); break; #endif case CONT_PMD_SIZE: return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry)))); case PMD_SIZE: return pmd_pte(pmd_mkhuge(pte_pmd(entry))); case CONT_PTE_SIZE: return pte_mkcont(entry); default: break; } pr_warn("%s: unrecognized huge page size 0x%lx\n", __func__, pagesize); return entry; } void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { int i, ncontig; size_t pgsize; ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { int ncontig; size_t pgsize; ncontig = num_contig_ptes(sz, &pgsize); return get_clear_contig(mm, addr, ptep, pgsize, ncontig); } /* * huge_ptep_set_access_flags will update access flags (dirty, accesssed) * and write permission. * * For a contiguous huge pte range we need to check whether or not write * permission has to change only on the first pte in the set. Then for * all the contiguous ptes we need to check whether or not there is a * discrepancy between dirty or young. */ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; if (pte_young(pte) != pte_young(orig_pte)) return 1; } return 0; } int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { int ncontig; size_t pgsize = 0; struct mm_struct *mm = vma->vm_mm; pte_t orig_pte; VM_WARN_ON(!pte_present(pte)); if (!pte_cont(pte)) return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); VM_WARN_ON(!pte_present(orig_pte)); /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) pte = pte_mkdirty(pte); if (pte_young(orig_pte)) pte = pte_mkyoung(pte); __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); return 1; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { int ncontig; size_t pgsize; pte_t pte; pte = __ptep_get(ptep); VM_WARN_ON(!pte_present(pte)); if (!pte_cont(pte)) { __ptep_set_wrprotect(mm, addr, ptep); return; } ncontig = find_num_contig(mm, addr, ptep, &pgsize); pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { struct mm_struct *mm = vma->vm_mm; size_t pgsize; int ncontig; ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); } static int __init hugetlbpage_init(void) { /* * HugeTLB pages are supported on maximum four page table * levels (PUD, CONT PMD, PMD, CONT PTE) for a given base * page size, corresponding to hugetlb_add_hstate() calls * here. * * HUGE_MAX_HSTATE should at least match maximum supported * HugeTLB page sizes on the platform. Any new addition to * supported HugeTLB page sizes will also require changing * HUGE_MAX_HSTATE as well. */ BUILD_BUG_ON(HUGE_MAX_HSTATE < 4); if (pud_sect_supported()) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT); hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT); return 0; } arch_initcall(hugetlbpage_init); bool __init arch_hugetlb_valid_size(unsigned long size) { return __hugetlb_valid_size(size); } pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { unsigned long psize = huge_page_size(hstate_vma(vma)); if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { unsigned long psize = huge_page_size(hstate_vma(vma)); set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); }
6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020 Facebook Inc. #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/udp_tunnel.h> #include <net/vxlan.h> enum udp_tunnel_nic_table_entry_flags { UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0), UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1), UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2), UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3), }; struct udp_tunnel_nic_table_entry { __be16 port; u8 type; u8 flags; u16 use_cnt; #define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX u8 hw_priv; }; /** * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled * @n_tables: number of tables under @entries * @missed: bitmap of tables which overflown * @entries: table of tables of ports currently offloaded */ struct udp_tunnel_nic { struct work_struct work; struct net_device *dev; u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; unsigned int n_tables; unsigned long missed; struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables); }; /* We ensure all work structs are done using driver state, but not the code. * We need a workqueue we can flush before module gets removed. */ static struct workqueue_struct *udp_tunnel_nic_workqueue; static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type) { switch (type) { case UDP_TUNNEL_TYPE_VXLAN: return "vxlan"; case UDP_TUNNEL_TYPE_GENEVE: return "geneve"; case UDP_TUNNEL_TYPE_VXLAN_GPE: return "vxlan-gpe"; default: return "unknown"; } } static bool udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt == 0 && !entry->flags; } static bool udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry) { return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN); } static bool udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry) { if (!udp_tunnel_nic_entry_is_free(entry)) entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN; } static void udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry) { entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN; } static bool udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry) { return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD | UDP_TUNNEL_NIC_ENTRY_DEL); } static void udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn, struct udp_tunnel_nic_table_entry *entry, unsigned int flag) { entry->flags |= flag; utn->need_sync = 1; } static void udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry, struct udp_tunnel_info *ti) { memset(ti, 0, sizeof(*ti)); ti->port = entry->port; ti->type = entry->type; ti->hw_priv = entry->hw_priv; } static bool udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return false; return true; } static bool udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; if (!utn->missed) return false; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!test_bit(i, &utn->missed)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j])) return true; } return false; } static void __udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; entry = &utn->entries[table][idx]; if (entry->use_cnt) udp_tunnel_nic_ti_from_entry(entry, ti); } static void __udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv; } static void udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry, int err) { bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD && (!err || (err == -EEXIST && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD; if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL && (!err || (err == -ENOENT && dodgy))) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL; if (!err) entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL; else entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL; } static void udp_tunnel_nic_device_sync_one(struct net_device *dev, struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx) { struct udp_tunnel_nic_table_entry *entry; struct udp_tunnel_info ti; int err; entry = &utn->entries[table][idx]; if (!udp_tunnel_nic_entry_is_queued(entry)) return; udp_tunnel_nic_ti_from_entry(entry, &ti); if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti); else err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx, &ti); udp_tunnel_nic_entry_update_done(entry, err); if (err) netdev_warn(dev, "UDP tunnel port sync failed port %d type %s: %d\n", be16_to_cpu(entry->port), udp_tunnel_nic_tunnel_type_name(entry->type), err); } static void udp_tunnel_nic_device_sync_by_port(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_device_sync_one(dev, utn, i, j); } static void udp_tunnel_nic_device_sync_by_table(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; int err; for (i = 0; i < utn->n_tables; i++) { /* Find something that needs sync in this table */ for (j = 0; j < info->tables[i].n_entries; j++) if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j])) break; if (j == info->tables[i].n_entries) continue; err = info->sync_table(dev, i); if (err) netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n", i, err); for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (udp_tunnel_nic_entry_is_queued(entry)) udp_tunnel_nic_entry_update_done(entry, err); } } } static void __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { if (!utn->need_sync) return; if (dev->udp_tunnel_nic_info->sync_table) udp_tunnel_nic_device_sync_by_table(dev, utn); else udp_tunnel_nic_device_sync_by_port(dev, utn); utn->need_sync = 0; /* Can't replay directly here, in case we come from the tunnel driver's * notification - trying to replay may deadlock inside tunnel driver. */ utn->need_replay = udp_tunnel_nic_should_replay(dev, utn); } static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; bool may_sleep; if (!utn->need_sync) return; /* Drivers which sleep in the callback need to update from * the workqueue, if we come from the tunnel driver's notification. */ may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (!may_sleep) __udp_tunnel_nic_device_sync(dev, utn); if (may_sleep || utn->need_replay) { queue_work(udp_tunnel_nic_workqueue, &utn->work); utn->work_pending = 1; } } static bool udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table, struct udp_tunnel_info *ti) { return table->tunnel_types & ti->type; } static bool udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i; /* Special case IPv4-only NICs */ if (info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY && ti->sa_family != AF_INET) return false; for (i = 0; i < utn->n_tables; i++) if (udp_tunnel_nic_table_is_capable(&info->tables[i], ti)) return true; return false; } static int udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_table_entry *entry; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry) && entry->port == ti->port && entry->type != ti->type) { __set_bit(i, &utn->missed); return true; } } return false; } static void udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL; unsigned int from, to; WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX); /* If not going from used to unused or vice versa - all done. * For dodgy entries make sure we try to sync again (queue the entry). */ entry->use_cnt += use_cnt_adj; if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj)) return; /* Cancel the op before it was sent to the device, if possible, * otherwise we'd need to take special care to issue commands * in the same order the ports arrived. */ if (use_cnt_adj < 0) { from = UDP_TUNNEL_NIC_ENTRY_ADD; to = UDP_TUNNEL_NIC_ENTRY_DEL; } else { from = UDP_TUNNEL_NIC_ENTRY_DEL; to = UDP_TUNNEL_NIC_ENTRY_ADD; } if (entry->flags & from) { entry->flags &= ~from; if (!dodgy) return; } udp_tunnel_nic_entry_queue(utn, entry, to); } static bool udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn, unsigned int table, unsigned int idx, struct udp_tunnel_info *ti, int use_cnt_adj) { struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx]; if (udp_tunnel_nic_entry_is_free(entry) || entry->port != ti->port || entry->type != ti->type) return false; if (udp_tunnel_nic_entry_is_frozen(entry)) return true; udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj); return true; } /* Try to find existing matching entry and adjust its use count, instead of * adding a new one. Returns true if entry was found. In case of delete the * entry may have gotten removed in the process, in which case it will be * queued for removal. */ static bool udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti, int use_cnt_adj) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) if (udp_tunnel_nic_entry_try_adj(utn, i, j, ti, use_cnt_adj)) return true; } return false; } static bool udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, +1); } static bool udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { return udp_tunnel_nic_try_existing(dev, utn, ti, -1); } static bool udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_table_info *table; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) { table = &dev->udp_tunnel_nic_info->tables[i]; if (!udp_tunnel_nic_table_is_capable(table, ti)) continue; for (j = 0; j < table->n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; if (!udp_tunnel_nic_entry_is_free(entry)) continue; entry->port = ti->port; entry->type = ti->type; entry->use_cnt = 1; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); return true; } /* The different table may still fit this port in, but there * are no devices currently which have multiple tables accepting * the same tunnel type, and false positives are okay. */ __set_bit(i, &utn->missed); } return false; } static void __udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY) return; if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN && ti->port == htons(IANA_VXLAN_UDP_PORT)) { if (ti->type != UDP_TUNNEL_TYPE_VXLAN) netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n"); return; } if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; /* It may happen that a tunnel of one type is removed and different * tunnel type tries to reuse its port before the device was informed. * Rely on utn->missed to re-add this port later. */ if (udp_tunnel_nic_has_collision(dev, utn, ti)) return; if (!udp_tunnel_nic_add_existing(dev, utn, ti)) udp_tunnel_nic_add_new(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti) { struct udp_tunnel_nic *utn; utn = dev->udp_tunnel_nic; if (!utn) return; if (!udp_tunnel_nic_is_capable(dev, utn, ti)) return; udp_tunnel_nic_del_existing(dev, utn, ti); udp_tunnel_nic_device_sync(dev, utn); } static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int i, j; ASSERT_RTNL(); utn = dev->udp_tunnel_nic; if (!utn) return; utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { struct udp_tunnel_nic_table_entry *entry; entry = &utn->entries[i][j]; entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); /* We don't release rtnl across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; udp_tunnel_nic_entry_queue(utn, entry, UDP_TUNNEL_NIC_ENTRY_ADD); } __udp_tunnel_nic_device_sync(dev, utn); } static size_t __udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; unsigned int j; size_t size; utn = dev->udp_tunnel_nic; if (!utn) return 0; size = 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; size += nla_total_size(0) + /* _TABLE_ENTRY */ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */ } return size; } static int __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic *utn; struct nlattr *nest; unsigned int j; utn = dev->udp_tunnel_nic; if (!utn) return 0; for (j = 0; j < info->tables[table].n_entries; j++) { if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j])) continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE, ilog2(utn->entries[table][j].type))) goto err_cancel; nla_nest_end(skb, nest); } return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, .add_port = __udp_tunnel_nic_add_port, .del_port = __udp_tunnel_nic_del_port, .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, }; static void udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; unsigned int i, j; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { int adj_cnt = -utn->entries[i][j].use_cnt; if (adj_cnt) udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt); } __udp_tunnel_nic_device_sync(dev, utn); for (i = 0; i < utn->n_tables; i++) memset(utn->entries[i], 0, array_size(info->tables[i].n_entries, sizeof(**utn->entries))); WARN_ON(utn->need_sync); utn->need_replay = 0; } static void udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node; unsigned int i, j; /* Freeze all the ports we are already tracking so that the replay * does not double up the refcount. */ for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]); utn->missed = 0; utn->need_replay = 0; if (!info->shared) { udp_tunnel_get_rx_info(dev); } else { list_for_each_entry(node, &info->shared->devices, list) udp_tunnel_get_rx_info(node->dev); } for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]); } static void udp_tunnel_nic_device_sync_work(struct work_struct *work) { struct udp_tunnel_nic *utn = container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); rtnl_unlock(); } static struct udp_tunnel_nic * udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, unsigned int n_tables) { struct udp_tunnel_nic *utn; unsigned int i; utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL); if (!utn) return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, sizeof(*utn->entries[i]), GFP_KERNEL); if (!utn->entries[i]) goto err_free_prev_entries; } return utn; err_free_prev_entries: while (i--) kfree(utn->entries[i]); kfree(utn); return NULL; } static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn) { unsigned int i; for (i = 0; i < utn->n_tables; i++) kfree(utn->entries[i]); kfree(utn); } static int udp_tunnel_nic_register(struct net_device *dev) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; struct udp_tunnel_nic_shared_node *node = NULL; struct udp_tunnel_nic *utn; unsigned int n_tables, i; BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE < UDP_TUNNEL_NIC_MAX_TABLES); /* Expect use count of at most 2 (IPv4, IPv6) per device */ BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX < UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2); /* Check that the driver info is sane */ if (WARN_ON(!info->set_port != !info->unset_port) || WARN_ON(!info->set_port == !info->sync_table) || WARN_ON(!info->tables[0].n_entries)) return -EINVAL; if (WARN_ON(info->shared && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return -EINVAL; n_tables = 1; for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) { if (!info->tables[i].n_entries) continue; n_tables++; if (WARN_ON(!info->tables[i - 1].n_entries)) return -EINVAL; } /* Create UDP tunnel state structures */ if (info->shared) { node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; node->dev = dev; } if (info->shared && info->shared->udp_tunnel_nic_info) { utn = info->shared->udp_tunnel_nic_info; } else { utn = udp_tunnel_nic_alloc(info, n_tables); if (!utn) { kfree(node); return -ENOMEM; } } if (info->shared) { if (!info->shared->udp_tunnel_nic_info) { INIT_LIST_HEAD(&info->shared->devices); info->shared->udp_tunnel_nic_info = utn; } list_add_tail(&node->list, &info->shared->devices); } utn->dev = dev; dev_hold(dev); dev->udp_tunnel_nic = utn; if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) udp_tunnel_get_rx_info(dev); return 0; } static void udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; /* For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ if (info->shared) { struct udp_tunnel_nic_shared_node *node, *first; list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; if (list_entry_is_head(node, &info->shared->devices, list)) return; list_del(&node->list); kfree(node); first = list_first_entry_or_null(&info->shared->devices, typeof(*first), list); if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; goto release_dev; } info->shared->udp_tunnel_nic_info = NULL; } /* Flush before we check work, so we don't waste time adding entries * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. */ if (utn->work_pending) return; udp_tunnel_nic_free(utn); release_dev: dev->udp_tunnel_nic = NULL; dev_put(dev); } static int udp_tunnel_nic_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); const struct udp_tunnel_nic_info *info; struct udp_tunnel_nic *utn; info = dev->udp_tunnel_nic_info; if (!info) return NOTIFY_DONE; if (event == NETDEV_REGISTER) { int err; err = udp_tunnel_nic_register(dev); if (err) netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ utn = dev->udp_tunnel_nic; if (!utn) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { udp_tunnel_nic_unregister(dev, utn); return NOTIFY_OK; } /* All other events only matter if NIC has to be programmed open */ if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) return NOTIFY_DONE; if (event == NETDEV_UP) { WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { udp_tunnel_nic_flush(dev, utn); return NOTIFY_OK; } return NOTIFY_DONE; } static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = { .notifier_call = udp_tunnel_nic_netdevice_event, }; static int __init udp_tunnel_nic_init_module(void) { int err; udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; rtnl_lock(); udp_tunnel_nic_ops = &__udp_tunnel_nic_ops; rtnl_unlock(); err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block); if (err) goto err_unset_ops; return 0; err_unset_ops: rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); return err; } late_initcall(udp_tunnel_nic_init_module); static void __exit udp_tunnel_nic_cleanup_module(void) { unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block); rtnl_lock(); udp_tunnel_nic_ops = NULL; rtnl_unlock(); destroy_workqueue(udp_tunnel_nic_workqueue); } module_exit(udp_tunnel_nic_cleanup_module); MODULE_LICENSE("GPL");
150 151 151 151 151 98 96 151 553 549 553 552 552 362 403 552 854 856 857 858 856 182 575 151 552 769 349 552 553 552 552 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021, Google LLC. * Pasha Tatashin <pasha.tatashin@soleen.com> */ #include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> #include <linux/swap.h> #include <linux/swapops.h> #undef pr_fmt #define pr_fmt(fmt) "page_table_check: " fmt struct page_table_check { atomic_t anon_map_count; atomic_t file_map_count; }; static bool __page_table_check_enabled __initdata = IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); static bool __init need_page_table_check(void) { return __page_table_check_enabled; } static void __init init_page_table_check(void) { if (!__page_table_check_enabled) return; static_branch_disable(&page_table_check_disabled); } struct page_ext_operations page_table_check_ops = { .size = sizeof(struct page_table_check), .need = need_page_table_check, .init = init_page_table_check, .need_shared_flags = false, }; static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); return page_ext_data(page_ext, &page_table_check_ops); } /* * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); BUG_ON(PageSlab(page)); anon = PageAnon(page); rcu_read_lock(); for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); } } rcu_read_unlock(); } /* * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext_iter iter; struct page_ext *page_ext; struct page *page; bool anon; if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); BUG_ON(PageSlab(page)); anon = PageAnon(page); rcu_read_lock(); for_each_page_ext(page, pgcnt, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); if (anon) { BUG_ON(atomic_read(&ptc->file_map_count)); BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); } else { BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); } } rcu_read_unlock(); } /* * page is on free list, or is being allocated, verify that counters are zeroes * crash if they are not. */ void __page_table_check_zero(struct page *page, unsigned int order) { struct page_ext_iter iter; struct page_ext *page_ext; BUG_ON(PageSlab(page)); rcu_read_lock(); for_each_page_ext(page, 1 << order, page_ext, iter) { struct page_table_check *ptc = get_page_table_check(page_ext); BUG_ON(atomic_read(&ptc->anon_map_count)); BUG_ON(atomic_read(&ptc->file_map_count)); } rcu_read_unlock(); } void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; if (pte_user_accessible_page(pte)) { page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; if (pmd_user_accessible_page(pmd)) { page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; if (pud_user_accessible_page(pud)) { page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); /* Whether the swap entry cached writable information */ static inline bool swap_cached_writable(swp_entry_t entry) { return is_writable_device_private_entry(entry) || is_writable_migration_entry(entry); } static inline void page_table_check_pte_flags(pte_t pte) { if (pte_present(pte) && pte_uffd_wp(pte)) WARN_ON_ONCE(pte_write(pte)); else if (is_swap_pte(pte) && pte_swp_uffd_wp(pte)) WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte))); } void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, unsigned int nr) { unsigned int i; if (&init_mm == mm) return; page_table_check_pte_flags(pte); for (i = 0; i < nr; i++) __page_table_check_pte_clear(mm, ptep_get(ptep + i)); if (pte_user_accessible_page(pte)) page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } EXPORT_SYMBOL(__page_table_check_ptes_set); static inline void page_table_check_pmd_flags(pmd_t pmd) { if (pmd_present(pmd) && pmd_uffd_wp(pmd)) WARN_ON_ONCE(pmd_write(pmd)); else if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd)) WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd))); } void __page_table_check_pmds_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd, unsigned int nr) { unsigned long stride = PMD_SIZE >> PAGE_SHIFT; unsigned int i; if (&init_mm == mm) return; page_table_check_pmd_flags(pmd); for (i = 0; i < nr; i++) __page_table_check_pmd_clear(mm, *(pmdp + i)); if (pmd_user_accessible_page(pmd)) page_table_check_set(pmd_pfn(pmd), stride * nr, pmd_write(pmd)); } EXPORT_SYMBOL(__page_table_check_pmds_set); void __page_table_check_puds_set(struct mm_struct *mm, pud_t *pudp, pud_t pud, unsigned int nr) { unsigned long stride = PUD_SIZE >> PAGE_SHIFT; unsigned int i; if (&init_mm == mm) return; for (i = 0; i < nr; i++) __page_table_check_pud_clear(mm, *(pudp + i)); if (pud_user_accessible_page(pud)) page_table_check_set(pud_pfn(pud), stride * nr, pud_write(pud)); } EXPORT_SYMBOL(__page_table_check_puds_set); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd) { if (&init_mm == mm) return; if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { pte_t *ptep = pte_offset_map(&pmd, addr); unsigned long i; if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } pte_unmap(ptep - PTRS_PER_PTE); } }
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 // SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/netpoll.h> #include <linux/export.h> #include <net/gro.h> #include "vlan.h" bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; __be16 vlan_proto = skb->vlan_proto; u16 vlan_id = skb_vlan_tag_get_id(skb); struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id); if (!vlan_dev) return false; skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return false; if (unlikely(!(vlan_dev->flags & IFF_UP))) { kfree_skb(skb); *skbp = NULL; return false; } skb->dev = vlan_dev; if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { /* Our lower layer thinks this is not local, let's make sure. * This allows the VLAN to have a different MAC than the * underlying device, and still route correctly. */ if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr)) skb->pkt_type = PACKET_HOST; } if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) && !netif_is_macvlan_port(vlan_dev) && !netif_is_bridge_port(vlan_dev)) { unsigned int offset = skb->data - skb_mac_header(skb); /* * vlan_insert_tag expect skb->data pointing to mac header. * So change skb->data before calling it and change back to * original position later */ skb_push(skb, offset); skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto, skb->vlan_tci, skb->mac_len); if (!skb) return false; skb_pull(skb, offset + VLAN_HLEN); skb_reset_mac_len(skb); } skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); __vlan_hwaccel_clear_tag(skb); rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats); u64_stats_update_begin(&rx_stats->syncp); u64_stats_inc(&rx_stats->rx_packets); u64_stats_add(&rx_stats->rx_bytes, skb->len); if (skb->pkt_type == PACKET_MULTICAST) u64_stats_inc(&rx_stats->rx_multicast); u64_stats_update_end(&rx_stats->syncp); return true; } /* Must be invoked with rcu_read_lock. */ struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev, __be16 vlan_proto, u16 vlan_id) { struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info); if (vlan_info) { return vlan_group_get_device(&vlan_info->grp, vlan_proto, vlan_id); } else { /* * Lower devices of master uppers (bonding, team) do not have * grp assigned to themselves. Grp is assigned to upper device * instead. */ struct net_device *upper_dev; upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) return __vlan_find_dev_deep_rcu(upper_dev, vlan_proto, vlan_id); } return NULL; } EXPORT_SYMBOL(__vlan_find_dev_deep_rcu); struct net_device *vlan_dev_real_dev(const struct net_device *dev) { struct net_device *ret = vlan_dev_priv(dev)->real_dev; while (is_vlan_dev(ret)) ret = vlan_dev_priv(ret)->real_dev; return ret; } EXPORT_SYMBOL(vlan_dev_real_dev); u16 vlan_dev_vlan_id(const struct net_device *dev) { return vlan_dev_priv(dev)->vlan_id; } EXPORT_SYMBOL(vlan_dev_vlan_id); __be16 vlan_dev_vlan_proto(const struct net_device *dev) { return vlan_dev_priv(dev)->vlan_proto; } EXPORT_SYMBOL(vlan_dev_vlan_proto); /* * vlan info and vid list */ static void vlan_group_free(struct vlan_group *grp) { int i, j; for (i = 0; i < VLAN_PROTO_NUM; i++) for (j = 0; j < VLAN_GROUP_ARRAY_SPLIT_PARTS; j++) kfree(grp->vlan_devices_arrays[i][j]); } static void vlan_info_free(struct vlan_info *vlan_info) { vlan_group_free(&vlan_info->grp); kfree(vlan_info); } static void vlan_info_rcu_free(struct rcu_head *rcu) { vlan_info_free(container_of(rcu, struct vlan_info, rcu)); } static struct vlan_info *vlan_info_alloc(struct net_device *dev) { struct vlan_info *vlan_info; vlan_info = kzalloc(sizeof(struct vlan_info), GFP_KERNEL); if (!vlan_info) return NULL; vlan_info->real_dev = dev; INIT_LIST_HEAD(&vlan_info->vid_list); return vlan_info; } struct vlan_vid_info { struct list_head list; __be16 proto; u16 vid; int refcount; }; static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto) { if (proto == htons(ETH_P_8021Q) && dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) return true; if (proto == htons(ETH_P_8021AD) && dev->features & NETIF_F_HW_VLAN_STAG_FILTER) return true; return false; } static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info, __be16 proto, u16 vid) { struct vlan_vid_info *vid_info; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (vid_info->proto == proto && vid_info->vid == vid) return vid_info; } return NULL; } static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid) { struct vlan_vid_info *vid_info; vid_info = kzalloc(sizeof(struct vlan_vid_info), GFP_KERNEL); if (!vid_info) return NULL; vid_info->proto = proto; vid_info->vid = vid; return vid_info; } static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) { if (!vlan_hw_filter_capable(dev, proto)) return 0; if (netif_device_present(dev)) return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid); else return -ENODEV; } static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid) { if (!vlan_hw_filter_capable(dev, proto)) return 0; if (netif_device_present(dev)) return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid); else return -ENODEV; } int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; struct net_device *vdev; int ret; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto, vid_info->vid); ret = action(vdev, vid_info->vid, arg); if (ret) return ret; } return 0; } EXPORT_SYMBOL(vlan_for_each); int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto) { struct net_device *real_dev = vlan_info->real_dev; struct vlan_vid_info *vlan_vid_info; int err; list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) { if (vlan_vid_info->proto == proto) { err = vlan_add_rx_filter_info(real_dev, proto, vlan_vid_info->vid); if (err) goto unwind; } } return 0; unwind: list_for_each_entry_continue_reverse(vlan_vid_info, &vlan_info->vid_list, list) { if (vlan_vid_info->proto == proto) vlan_kill_rx_filter_info(real_dev, proto, vlan_vid_info->vid); } return err; } EXPORT_SYMBOL(vlan_filter_push_vids); void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto) { struct vlan_vid_info *vlan_vid_info; list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) if (vlan_vid_info->proto == proto) vlan_kill_rx_filter_info(vlan_info->real_dev, vlan_vid_info->proto, vlan_vid_info->vid); } EXPORT_SYMBOL(vlan_filter_drop_vids); static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid, struct vlan_vid_info **pvid_info) { struct net_device *dev = vlan_info->real_dev; struct vlan_vid_info *vid_info; int err; vid_info = vlan_vid_info_alloc(proto, vid); if (!vid_info) return -ENOMEM; err = vlan_add_rx_filter_info(dev, proto, vid); if (err) { kfree(vid_info); return err; } list_add(&vid_info->list, &vlan_info->vid_list); vlan_info->nr_vids++; *pvid_info = vid_info; return 0; } int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; bool vlan_info_created = false; int err; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) { vlan_info = vlan_info_alloc(dev); if (!vlan_info) return -ENOMEM; vlan_info_created = true; } vid_info = vlan_vid_info_get(vlan_info, proto, vid); if (!vid_info) { err = __vlan_vid_add(vlan_info, proto, vid, &vid_info); if (err) goto out_free_vlan_info; } vid_info->refcount++; if (vlan_info_created) rcu_assign_pointer(dev->vlan_info, vlan_info); return 0; out_free_vlan_info: if (vlan_info_created) kfree(vlan_info); return err; } EXPORT_SYMBOL(vlan_vid_add); static void __vlan_vid_del(struct vlan_info *vlan_info, struct vlan_vid_info *vid_info) { struct net_device *dev = vlan_info->real_dev; __be16 proto = vid_info->proto; u16 vid = vid_info->vid; int err; err = vlan_kill_rx_filter_info(dev, proto, vid); if (err && dev->reg_state != NETREG_UNREGISTERING) netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid); list_del(&vid_info->list); kfree(vid_info); vlan_info->nr_vids--; } void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { struct vlan_info *vlan_info; struct vlan_vid_info *vid_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return; vid_info = vlan_vid_info_get(vlan_info, proto, vid); if (!vid_info) return; vid_info->refcount--; if (vid_info->refcount == 0) { __vlan_vid_del(vlan_info, vid_info); if (vlan_info->nr_vids == 0) { RCU_INIT_POINTER(dev->vlan_info, NULL); call_rcu(&vlan_info->rcu, vlan_info_rcu_free); } } } EXPORT_SYMBOL(vlan_vid_del); int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; int err; ASSERT_RTNL(); vlan_info = rtnl_dereference(by_dev->vlan_info); if (!vlan_info) return 0; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; err = vlan_vid_add(dev, vid_info->proto, vid_info->vid); if (err) goto unwind; } return 0; unwind: list_for_each_entry_continue_reverse(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); } return err; } EXPORT_SYMBOL(vlan_vids_add_by_dev); void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { struct vlan_vid_info *vid_info; struct vlan_info *vlan_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(by_dev->vlan_info); if (!vlan_info) return; list_for_each_entry(vid_info, &vlan_info->vid_list, list) { if (!vlan_hw_filter_capable(by_dev, vid_info->proto)) continue; vlan_vid_del(dev, vid_info->proto, vid_info->vid); } } EXPORT_SYMBOL(vlan_vids_del_by_dev); bool vlan_uses_dev(const struct net_device *dev) { struct vlan_info *vlan_info; ASSERT_RTNL(); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) return false; return vlan_info->grp.nr_vlan_devs ? true : false; } EXPORT_SYMBOL(vlan_uses_dev); static struct sk_buff *vlan_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; unsigned int hlen, off_vlan; struct sk_buff *pp = NULL; struct vlan_hdr *vhdr; struct sk_buff *p; __be16 type; int flush = 1; off_vlan = skb_gro_offset(skb); hlen = off_vlan + sizeof(*vhdr); vhdr = skb_gro_header(skb, hlen, off_vlan); if (unlikely(!vhdr)) goto out; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen; type = vhdr->h_vlan_encapsulated_proto; ptype = gro_find_receive_by_type(type); if (!ptype) goto out; flush = 0; list_for_each_entry(p, head, list) { struct vlan_hdr *vhdr2; if (!NAPI_GRO_CB(p)->same_flow) continue; vhdr2 = (struct vlan_hdr *)(p->data + off_vlan); if (compare_vlan_header(vhdr, vhdr2)) NAPI_GRO_CB(p)->same_flow = 0; } skb_gro_pull(skb, sizeof(*vhdr)); skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr)); pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static int vlan_gro_complete(struct sk_buff *skb, int nhoff) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff); __be16 type = vhdr->h_vlan_encapsulated_proto; struct packet_offload *ptype; int err = -ENOENT; ptype = gro_find_complete_by_type(type); if (ptype) err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, nhoff + sizeof(*vhdr)); return err; } static struct packet_offload vlan_packet_offloads[] __read_mostly = { { .type = cpu_to_be16(ETH_P_8021Q), .priority = 10, .callbacks = { .gro_receive = vlan_gro_receive, .gro_complete = vlan_gro_complete, }, }, { .type = cpu_to_be16(ETH_P_8021AD), .priority = 10, .callbacks = { .gro_receive = vlan_gro_receive, .gro_complete = vlan_gro_complete, }, }, }; static int __init vlan_offload_init(void) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++) dev_add_offload(&vlan_packet_offloads[i]); return 0; } fs_initcall(vlan_offload_init);
33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/ptrace.c * * By Ross Biro 1/23/92 * edited by Linus Torvalds * ARM modifications Copyright (C) 2000 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/audit.h> #include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/nospec.h> #include <linux/smp.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/seccomp.h> #include <linux/security.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/string.h> #include <linux/uaccess.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/regset.h> #include <linux/elf.h> #include <linux/rseq.h> #include <asm/compat.h> #include <asm/cpufeature.h> #include <asm/debug-monitors.h> #include <asm/fpsimd.h> #include <asm/gcs.h> #include <asm/mte.h> #include <asm/pointer_auth.h> #include <asm/stacktrace.h> #include <asm/syscall.h> #include <asm/traps.h> #include <asm/system_misc.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} #define GPR_OFFSET_NAME(r) \ {.name = "x" #r, .offset = offsetof(struct pt_regs, regs[r])} static const struct pt_regs_offset regoffset_table[] = { GPR_OFFSET_NAME(0), GPR_OFFSET_NAME(1), GPR_OFFSET_NAME(2), GPR_OFFSET_NAME(3), GPR_OFFSET_NAME(4), GPR_OFFSET_NAME(5), GPR_OFFSET_NAME(6), GPR_OFFSET_NAME(7), GPR_OFFSET_NAME(8), GPR_OFFSET_NAME(9), GPR_OFFSET_NAME(10), GPR_OFFSET_NAME(11), GPR_OFFSET_NAME(12), GPR_OFFSET_NAME(13), GPR_OFFSET_NAME(14), GPR_OFFSET_NAME(15), GPR_OFFSET_NAME(16), GPR_OFFSET_NAME(17), GPR_OFFSET_NAME(18), GPR_OFFSET_NAME(19), GPR_OFFSET_NAME(20), GPR_OFFSET_NAME(21), GPR_OFFSET_NAME(22), GPR_OFFSET_NAME(23), GPR_OFFSET_NAME(24), GPR_OFFSET_NAME(25), GPR_OFFSET_NAME(26), GPR_OFFSET_NAME(27), GPR_OFFSET_NAME(28), GPR_OFFSET_NAME(29), GPR_OFFSET_NAME(30), {.name = "lr", .offset = offsetof(struct pt_regs, regs[30])}, REG_OFFSET_NAME(sp), REG_OFFSET_NAME(pc), REG_OFFSET_NAME(pstate), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_within_kernel_stack() - check the address in the stack * @regs: pt_regs which contains kernel stack pointer. * @addr: address which is checked. * * regs_within_kernel_stack() checks @addr is within the kernel stack page(s). * If @addr is within the kernel stack, it returns true. If not, returns false. */ static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || on_irq_stack(addr, sizeof(unsigned long)); } /** * regs_get_kernel_stack_nth() - get Nth entry of the stack * @regs: pt_regs which contains kernel stack pointer. * @n: stack entry number. * * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which * is specified by @regs. If the @n th entry is NOT in the kernel stack, * this returns 0. */ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) { unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) return READ_ONCE_NOCHECK(*addr); else return 0; } /* * TODO: does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ /* * Called by kernel/ptrace.c when detaching.. */ void ptrace_disable(struct task_struct *child) { /* * This would be better off in core code, but PTRACE_DETACH has * grown its fair share of arch-specific worts and changing it * is likely to cause regressions on obscure architectures. */ user_disable_single_step(child); } #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * Handle hitting a HW-breakpoint. */ static void ptrace_hbptriggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); const char *desc = "Hardware breakpoint trap (ptrace)"; if (is_compat_task()) { int si_errno = 0; int i; for (i = 0; i < ARM_MAX_BRP; ++i) { if (current->thread.debug.hbp_break[i] == bp) { si_errno = (i << 1) + 1; break; } } for (i = 0; i < ARM_MAX_WRP; ++i) { if (current->thread.debug.hbp_watch[i] == bp) { si_errno = -((i << 1) + 1); break; } } arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); return; } arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* * Unregister breakpoints from this task and reset the pointers in * the thread_struct. */ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { int i; struct thread_struct *t = &tsk->thread; for (i = 0; i < ARM_MAX_BRP; i++) { if (t->debug.hbp_break[i]) { unregister_hw_breakpoint(t->debug.hbp_break[i]); t->debug.hbp_break[i] = NULL; } } for (i = 0; i < ARM_MAX_WRP; i++) { if (t->debug.hbp_watch[i]) { unregister_hw_breakpoint(t->debug.hbp_watch[i]); t->debug.hbp_watch[i] = NULL; } } } void ptrace_hw_copy_thread(struct task_struct *tsk) { memset(&tsk->thread.debug, 0, sizeof(struct debug_info)); } static struct perf_event *ptrace_hbp_get_event(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp = ERR_PTR(-EINVAL); switch (note_type) { case NT_ARM_HW_BREAK: if (idx >= ARM_MAX_BRP) goto out; idx = array_index_nospec(idx, ARM_MAX_BRP); bp = tsk->thread.debug.hbp_break[idx]; break; case NT_ARM_HW_WATCH: if (idx >= ARM_MAX_WRP) goto out; idx = array_index_nospec(idx, ARM_MAX_WRP); bp = tsk->thread.debug.hbp_watch[idx]; break; } out: return bp; } static int ptrace_hbp_set_event(unsigned int note_type, struct task_struct *tsk, unsigned long idx, struct perf_event *bp) { int err = -EINVAL; switch (note_type) { case NT_ARM_HW_BREAK: if (idx >= ARM_MAX_BRP) goto out; idx = array_index_nospec(idx, ARM_MAX_BRP); tsk->thread.debug.hbp_break[idx] = bp; err = 0; break; case NT_ARM_HW_WATCH: if (idx >= ARM_MAX_WRP) goto out; idx = array_index_nospec(idx, ARM_MAX_WRP); tsk->thread.debug.hbp_watch[idx] = bp; err = 0; break; } out: return err; } static struct perf_event *ptrace_hbp_create(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp; struct perf_event_attr attr; int err, type; switch (note_type) { case NT_ARM_HW_BREAK: type = HW_BREAKPOINT_X; break; case NT_ARM_HW_WATCH: type = HW_BREAKPOINT_RW; break; default: return ERR_PTR(-EINVAL); } ptrace_breakpoint_init(&attr); /* * Initialise fields to sane defaults * (i.e. values that will pass validation). */ attr.bp_addr = 0; attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = type; attr.disabled = 1; bp = register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, tsk); if (IS_ERR(bp)) return bp; err = ptrace_hbp_set_event(note_type, tsk, idx, bp); if (err) return ERR_PTR(err); return bp; } static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; switch (note_type) { case NT_ARM_HW_BREAK: if ((type & HW_BREAKPOINT_X) != type) return -EINVAL; break; case NT_ARM_HW_WATCH: if ((type & HW_BREAKPOINT_RW) != type) return -EINVAL; break; default: return -EINVAL; } attr->bp_len = len; attr->bp_type = type; attr->bp_addr += offset; return 0; } static int ptrace_hbp_get_resource_info(unsigned int note_type, u32 *info) { u8 num; u32 reg = 0; switch (note_type) { case NT_ARM_HW_BREAK: num = hw_breakpoint_slots(TYPE_INST); break; case NT_ARM_HW_WATCH: num = hw_breakpoint_slots(TYPE_DATA); break; default: return -EINVAL; } reg |= debug_monitors_arch(); reg <<= 8; reg |= num; *info = reg; return 0; } static int ptrace_hbp_get_ctrl(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u32 *ctrl) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); *ctrl = bp ? encode_ctrl_reg(counter_arch_bp(bp)->ctrl) : 0; return 0; } static int ptrace_hbp_get_addr(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u64 *addr) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (IS_ERR(bp)) return PTR_ERR(bp); *addr = bp ? counter_arch_bp(bp)->address : 0; return 0; } static struct perf_event *ptrace_hbp_get_initialised_bp(unsigned int note_type, struct task_struct *tsk, unsigned long idx) { struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx); if (!bp) bp = ptrace_hbp_create(note_type, tsk, idx); return bp; } static int ptrace_hbp_set_ctrl(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u32 uctrl) { int err; struct perf_event *bp; struct perf_event_attr attr; struct arch_hw_breakpoint_ctrl ctrl; bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) { err = PTR_ERR(bp); return err; } attr = bp->attr; decode_ctrl_reg(uctrl, &ctrl); err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } static int ptrace_hbp_set_addr(unsigned int note_type, struct task_struct *tsk, unsigned long idx, u64 addr) { int err; struct perf_event *bp; struct perf_event_attr attr; bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx); if (IS_ERR(bp)) { err = PTR_ERR(bp); return err; } attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); return err; } #define PTRACE_HBP_ADDR_SZ sizeof(u64) #define PTRACE_HBP_CTRL_SZ sizeof(u32) #define PTRACE_HBP_PAD_SZ sizeof(u32) static int hw_break_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { unsigned int note_type = regset->core_note_type; int ret, idx = 0; u32 info, ctrl; u64 addr; /* Resource info */ ret = ptrace_hbp_get_resource_info(note_type, &info); if (ret) return ret; membuf_write(&to, &info, sizeof(info)); membuf_zero(&to, sizeof(u32)); /* (address, ctrl) registers */ while (to.left) { ret = ptrace_hbp_get_addr(note_type, target, idx, &addr); if (ret) return ret; ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl); if (ret) return ret; membuf_store(&to, addr); membuf_store(&to, ctrl); membuf_zero(&to, sizeof(u32)); idx++; } return 0; } static int hw_break_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { unsigned int note_type = regset->core_note_type; int ret, idx = 0, offset, limit; u32 ctrl; u64 addr; /* Resource info and pad */ offset = offsetof(struct user_hwdebug_state, dbg_regs); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset); /* (address, ctrl) registers */ limit = regset->n * regset->size; while (count && offset < limit) { if (count < PTRACE_HBP_ADDR_SZ) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &addr, offset, offset + PTRACE_HBP_ADDR_SZ); if (ret) return ret; ret = ptrace_hbp_set_addr(note_type, target, idx, addr); if (ret) return ret; offset += PTRACE_HBP_ADDR_SZ; if (!count) break; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, offset, offset + PTRACE_HBP_CTRL_SZ); if (ret) return ret; ret = ptrace_hbp_set_ctrl(note_type, target, idx, ctrl); if (ret) return ret; offset += PTRACE_HBP_CTRL_SZ; user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, offset, offset + PTRACE_HBP_PAD_SZ); offset += PTRACE_HBP_PAD_SZ; idx++; } return 0; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ static int gpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs; return membuf_write(&to, uregs, sizeof(*uregs)); } static int gpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_pt_regs newregs = task_pt_regs(target)->user_regs; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newregs, 0, -1); if (ret) return ret; if (!valid_user_regs(&newregs, target)) return -EINVAL; task_pt_regs(target)->user_regs = newregs; return 0; } static int fpr_active(struct task_struct *target, const struct user_regset *regset) { if (!system_supports_fpsimd()) return -ENODEV; return regset->n; } /* * TODO: update fp accessors for lazy context switching (sync/flush hwstate) */ static int __fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_fpsimd_state *uregs; fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; return membuf_write(&to, uregs, sizeof(*uregs)); } static int fpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_fpsimd()) return -EINVAL; if (target == current) fpsimd_preserve_current_state(); return __fpr_get(target, regset, to); } static int __fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf, unsigned int start_pos) { int ret; struct user_fpsimd_state newstate; /* * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. */ fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate, start_pos, start_pos + sizeof(newstate)); if (ret) return ret; target->thread.uw.fpsimd_state = newstate; return ret; } static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; if (!system_supports_fpsimd()) return -EINVAL; ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0); if (ret) return ret; fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; } static int tls_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int ret; if (target == current) tls_preserve_current_state(); ret = membuf_store(&to, target->thread.uw.tp_value); if (system_supports_tpidr2()) ret = membuf_store(&to, target->thread.tpidr2_el0); else ret = membuf_zero(&to, sizeof(u64)); return ret; } static int tls_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; unsigned long tls[2]; tls[0] = target->thread.uw.tp_value; if (system_supports_tpidr2()) tls[1] = target->thread.tpidr2_el0; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count); if (ret) return ret; target->thread.uw.tp_value = tls[0]; if (system_supports_tpidr2()) target->thread.tpidr2_el0 = tls[1]; return ret; } static int fpmr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_fpmr()) return -EINVAL; if (target == current) fpsimd_preserve_current_state(); return membuf_store(&to, target->thread.uw.fpmr); } static int fpmr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; unsigned long fpmr; if (!system_supports_fpmr()) return -EINVAL; fpmr = target->thread.uw.fpmr; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count); if (ret) return ret; target->thread.uw.fpmr = fpmr; fpsimd_flush_task_state(target); return 0; } static int system_call_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { return membuf_store(&to, task_pt_regs(target)->syscallno); } static int system_call_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int syscallno = task_pt_regs(target)->syscallno; int ret; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &syscallno, 0, -1); if (ret) return ret; task_pt_regs(target)->syscallno = syscallno; return ret; } #ifdef CONFIG_ARM64_SVE static void sve_init_header_from_task(struct user_sve_header *header, struct task_struct *target, enum vec_type type) { unsigned int vq; bool active; enum vec_type task_type; memset(header, 0, sizeof(*header)); /* Check if the requested registers are active for the task */ if (thread_sm_enabled(&target->thread)) task_type = ARM64_VEC_SME; else task_type = ARM64_VEC_SVE; active = (task_type == type); if (active && target->thread.fp_type == FP_STATE_SVE) header->flags = SVE_PT_REGS_SVE; else header->flags = SVE_PT_REGS_FPSIMD; switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; break; case ARM64_VEC_SME: if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header->flags |= SVE_PT_VL_INHERIT; break; default: WARN_ON_ONCE(1); return; } header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); if (active) header->size = SVE_PT_SIZE(vq, header->flags); else header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } static unsigned int sve_size_from_header(struct user_sve_header const *header) { return ALIGN(header->size, SVE_VQ_BYTES); } static int sve_get_common(struct task_struct *target, const struct user_regset *regset, struct membuf to, enum vec_type type) { struct user_sve_header header; unsigned int vq; unsigned long start, end; if (target == current) fpsimd_preserve_current_state(); /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); /* * When the requested vector type is not active, do not present data * from the other mode to userspace. */ if (header.size == sizeof(header)) return 0; switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); case SVE_PT_REGS_SVE: start = SVE_PT_SVE_OFFSET; end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); membuf_write(&to, target->thread.sve_state, end - start); start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); membuf_zero(&to, end - start); /* * Copy fpsr, and fpcr which must follow contiguously in * struct fpsimd_state: */ start = end; end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr, end - start); start = end; end = sve_size_from_header(&header); return membuf_zero(&to, end - start); default: BUILD_BUG(); } } static int sve_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sve()) return -EINVAL; return sve_get_common(target, regset, to, ARM64_VEC_SVE); } static int sve_set_common(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf, enum vec_type type) { int ret; struct user_sve_header header; unsigned int vq; unsigned long start, end; bool fpsimd; fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) return ret; /* * Streaming SVE data is always stored and presented in SVE format. * Require the user to provide SVE formatted data for consistency, and * to avoid the risk that we configure the task into an invalid state. */ fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; if (fpsimd && type == ARM64_VEC_SME) return -EINVAL; /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for us: */ ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) return ret; /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ if (type == ARM64_VEC_SME) { sme_alloc(target, false); if (!target->thread.sme_state) return -ENOMEM; } /* Allocate SVE storage if necessary, zeroing any existing SVE state */ if (!fpsimd) { sve_alloc(target, true); if (!target->thread.sve_state) return -ENOMEM; } /* * Actual VL set may be different from what the user asked * for, or we may have configured the _ONEXEC VL not the * current VL: */ vq = sve_vq_from_vl(task_get_vl(target, type)); /* Enter/exit streaming mode */ if (system_supports_sme()) { switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; set_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); return -EINVAL; } } /* Always zero V regs, FPSR, and FPCR */ memset(&current->thread.uw.fpsimd_state, 0, sizeof(current->thread.uw.fpsimd_state)); /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, SVE_PT_FPSIMD_OFFSET); return ret; } /* Otherwise: no registers or full SVE case. */ target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ if (count && vq != sve_vq_from_vl(header.vl)) return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, target->thread.sve_state, start, end); if (ret) return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end); /* * Copy fpsr, and fpcr which must follow contiguously in * struct fpsimd_state: */ start = end; end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.uw.fpsimd_state.fpsr, start, end); return ret; } static int sve_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { if (!system_supports_sve()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, ARM64_VEC_SVE); } #endif /* CONFIG_ARM64_SVE */ #ifdef CONFIG_ARM64_SME static int ssve_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sme()) return -EINVAL; return sve_get_common(target, regset, to, ARM64_VEC_SME); } static int ssve_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { if (!system_supports_sme()) return -EINVAL; return sve_set_common(target, regset, pos, count, kbuf, ubuf, ARM64_VEC_SME); } static int za_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_za_header header; unsigned int vq; unsigned long start, end; if (!system_supports_sme()) return -EINVAL; /* Header */ memset(&header, 0, sizeof(header)); if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT)) header.flags |= ZA_PT_VL_INHERIT; header.vl = task_get_sme_vl(target); vq = sve_vq_from_vl(header.vl); header.max_vl = sme_max_vl(); header.max_size = ZA_PT_SIZE(vq); /* If ZA is not active there is only the header */ if (thread_za_enabled(&target->thread)) header.size = ZA_PT_SIZE(vq); else header.size = ZA_PT_ZA_OFFSET; membuf_write(&to, &header, sizeof(header)); BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); end = ZA_PT_ZA_OFFSET; if (target == current) fpsimd_preserve_current_state(); /* Any register data to include? */ if (thread_za_enabled(&target->thread)) { start = end; end = ZA_PT_SIZE(vq); membuf_write(&to, target->thread.sme_state, end - start); } /* Zero any trailing padding */ start = end; end = ALIGN(header.size, SVE_VQ_BYTES); return membuf_zero(&to, end - start); } static int za_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_za_header header; unsigned int vq; unsigned long start, end; if (!system_supports_sme()) return -EINVAL; /* Header */ if (count < sizeof(header)) return -EINVAL; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) goto out; /* * All current ZA_PT_* flags are consumed by * vec_set_vector_length(), which will also validate them for * us: */ ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl, ((unsigned long)header.flags) << 16); if (ret) goto out; /* * Actual VL set may be different from what the user asked * for, or we may have configured the _ONEXEC rather than * current VL: */ vq = sve_vq_from_vl(task_get_sme_vl(target)); /* Ensure there is some SVE storage for streaming mode */ if (!target->thread.sve_state) { sve_alloc(target, false); if (!target->thread.sve_state) { ret = -ENOMEM; goto out; } } /* * Only flush the storage if PSTATE.ZA was not already set, * otherwise preserve any existing data. */ sme_alloc(target, !thread_za_enabled(&target->thread)); if (!target->thread.sme_state) return -ENOMEM; /* If there is no data then disable ZA */ if (!count) { target->thread.svcr &= ~SVCR_ZA_MASK; goto out; } /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ if (vq != sve_vq_from_vl(header.vl)) { ret = -EIO; goto out; } BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header)); start = ZA_PT_ZA_OFFSET; end = ZA_PT_SIZE(vq); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, target->thread.sme_state, start, end); if (ret) goto out; /* Mark ZA as active and let userspace use it */ set_tsk_thread_flag(target, TIF_SME); target->thread.svcr |= SVCR_ZA_MASK; out: fpsimd_flush_task_state(target); return ret; } static int zt_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_sme2()) return -EINVAL; /* * If PSTATE.ZA is not set then ZT will be zeroed when it is * enabled so report the current register value as zero. */ if (thread_za_enabled(&target->thread)) membuf_write(&to, thread_zt_state(&target->thread), ZT_SIG_REG_BYTES); else membuf_zero(&to, ZT_SIG_REG_BYTES); return 0; } static int zt_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; if (!system_supports_sme2()) return -EINVAL; /* Ensure SVE storage in case this is first use of SME */ sve_alloc(target, false); if (!target->thread.sve_state) return -ENOMEM; if (!thread_za_enabled(&target->thread)) { sme_alloc(target, true); if (!target->thread.sme_state) return -ENOMEM; } ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, thread_zt_state(&target->thread), 0, ZT_SIG_REG_BYTES); if (ret == 0) { target->thread.svcr |= SVCR_ZA_MASK; set_tsk_thread_flag(target, TIF_SME); } fpsimd_flush_task_state(target); return ret; } #endif /* CONFIG_ARM64_SME */ #ifdef CONFIG_ARM64_PTR_AUTH static int pac_mask_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { /* * The PAC bits can differ across data and instruction pointers * depending on TCR_EL1.TBID*, which we may make use of in future, so * we expose separate masks. */ unsigned long mask = ptrauth_user_pac_mask(); struct user_pac_mask uregs = { .data_mask = mask, .insn_mask = mask, }; if (!system_supports_address_auth()) return -EINVAL; return membuf_write(&to, &uregs, sizeof(uregs)); } static int pac_enabled_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { long enabled_keys = ptrauth_get_enabled_keys(target); if (IS_ERR_VALUE(enabled_keys)) return enabled_keys; return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); } static int pac_enabled_keys_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long enabled_keys = ptrauth_get_enabled_keys(target); if (IS_ERR_VALUE(enabled_keys)) return enabled_keys; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, sizeof(long)); if (ret) return ret; return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, enabled_keys); } #ifdef CONFIG_CHECKPOINT_RESTORE static __uint128_t pac_key_to_user(const struct ptrauth_key *key) { return (__uint128_t)key->hi << 64 | key->lo; } static struct ptrauth_key pac_key_from_user(__uint128_t ukey) { struct ptrauth_key key = { .lo = (unsigned long)ukey, .hi = (unsigned long)(ukey >> 64), }; return key; } static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, const struct ptrauth_keys_user *keys) { ukeys->apiakey = pac_key_to_user(&keys->apia); ukeys->apibkey = pac_key_to_user(&keys->apib); ukeys->apdakey = pac_key_to_user(&keys->apda); ukeys->apdbkey = pac_key_to_user(&keys->apdb); } static void pac_address_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_address_keys *ukeys) { keys->apia = pac_key_from_user(ukeys->apiakey); keys->apib = pac_key_from_user(ukeys->apibkey); keys->apda = pac_key_from_user(ukeys->apdakey); keys->apdb = pac_key_from_user(ukeys->apdbkey); } static int pac_address_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; if (!system_supports_address_auth()) return -EINVAL; pac_address_keys_to_user(&user_keys, keys); return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_address_keys_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_address_keys user_keys; int ret; if (!system_supports_address_auth()) return -EINVAL; pac_address_keys_to_user(&user_keys, keys); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_keys, 0, -1); if (ret) return ret; pac_address_keys_from_user(keys, &user_keys); return 0; } static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, const struct ptrauth_keys_user *keys) { ukeys->apgakey = pac_key_to_user(&keys->apga); } static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys, const struct user_pac_generic_keys *ukeys) { keys->apga = pac_key_from_user(ukeys->apgakey); } static int pac_generic_keys_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; if (!system_supports_generic_auth()) return -EINVAL; pac_generic_keys_to_user(&user_keys, keys); return membuf_write(&to, &user_keys, sizeof(user_keys)); } static int pac_generic_keys_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct ptrauth_keys_user *keys = &target->thread.keys_user; struct user_pac_generic_keys user_keys; int ret; if (!system_supports_generic_auth()) return -EINVAL; pac_generic_keys_to_user(&user_keys, keys); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_keys, 0, -1); if (ret) return ret; pac_generic_keys_from_user(keys, &user_keys); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI static int tagged_addr_ctrl_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { long ctrl = get_tagged_addr_ctrl(target); if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; return membuf_write(&to, &ctrl, sizeof(ctrl)); } static int tagged_addr_ctrl_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long ctrl; ctrl = get_tagged_addr_ctrl(target); if (WARN_ON_ONCE(IS_ERR_VALUE(ctrl))) return ctrl; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; return set_tagged_addr_ctrl(target, ctrl); } #endif #ifdef CONFIG_ARM64_POE static int poe_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { if (!system_supports_poe()) return -EINVAL; return membuf_write(&to, &target->thread.por_el0, sizeof(target->thread.por_el0)); } static int poe_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; long ctrl; if (!system_supports_poe()) return -EINVAL; ctrl = target->thread.por_el0; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl, 0, -1); if (ret) return ret; target->thread.por_el0 = ctrl; return 0; } #endif #ifdef CONFIG_ARM64_GCS static void task_gcs_to_user(struct user_gcs *user_gcs, const struct task_struct *target) { user_gcs->features_enabled = target->thread.gcs_el0_mode; user_gcs->features_locked = target->thread.gcs_el0_locked; user_gcs->gcspr_el0 = target->thread.gcspr_el0; } static void task_gcs_from_user(struct task_struct *target, const struct user_gcs *user_gcs) { target->thread.gcs_el0_mode = user_gcs->features_enabled; target->thread.gcs_el0_locked = user_gcs->features_locked; target->thread.gcspr_el0 = user_gcs->gcspr_el0; } static int gcs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_gcs user_gcs; if (!system_supports_gcs()) return -EINVAL; if (target == current) gcs_preserve_current_state(); task_gcs_to_user(&user_gcs, target); return membuf_write(&to, &user_gcs, sizeof(user_gcs)); } static int gcs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; struct user_gcs user_gcs; if (!system_supports_gcs()) return -EINVAL; task_gcs_to_user(&user_gcs, target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &user_gcs, 0, -1); if (ret) return ret; if (user_gcs.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK) return -EINVAL; task_gcs_from_user(target, &user_gcs); return 0; } #endif enum aarch64_regset { REGSET_GPR, REGSET_FPR, REGSET_TLS, #ifdef CONFIG_HAVE_HW_BREAKPOINT REGSET_HW_BREAK, REGSET_HW_WATCH, #endif REGSET_FPMR, REGSET_SYSTEM_CALL, #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif #ifdef CONFIG_ARM64_SME REGSET_SSVE, REGSET_ZA, REGSET_ZT, #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, REGSET_PAC_ENABLED_KEYS, #ifdef CONFIG_CHECKPOINT_RESTORE REGSET_PACA_KEYS, REGSET_PACG_KEYS, #endif #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI REGSET_TAGGED_ADDR_CTRL, #endif #ifdef CONFIG_ARM64_POE REGSET_POE, #endif #ifdef CONFIG_ARM64_GCS REGSET_GCS, #endif }; static const struct user_regset aarch64_regsets[] = { [REGSET_GPR] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_pt_regs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = gpr_get, .set = gpr_set }, [REGSET_FPR] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_fpsimd_state) / sizeof(u32), /* * We pretend we have 32-bit registers because the fpsr and * fpcr are 32-bits wide. */ .size = sizeof(u32), .align = sizeof(u32), .active = fpr_active, .regset_get = fpr_get, .set = fpr_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, .n = 2, .size = sizeof(void *), .align = sizeof(void *), .regset_get = tls_get, .set = tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { .core_note_type = NT_ARM_HW_BREAK, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { .core_note_type = NT_ARM_HW_WATCH, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { .core_note_type = NT_ARM_SYSTEM_CALL, .n = 1, .size = sizeof(int), .align = sizeof(int), .regset_get = system_call_get, .set = system_call_set, }, [REGSET_FPMR] = { .core_note_type = NT_ARM_FPMR, .n = 1, .size = sizeof(u64), .align = sizeof(u64), .regset_get = fpmr_get, .set = fpmr_set, }, #ifdef CONFIG_ARM64_SVE [REGSET_SVE] = { /* Scalable Vector Extension */ .core_note_type = NT_ARM_SVE, .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = sve_get, .set = sve_set, }, #endif #ifdef CONFIG_ARM64_SME [REGSET_SSVE] = { /* Streaming mode SVE */ .core_note_type = NT_ARM_SSVE, .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = ssve_get, .set = ssve_set, }, [REGSET_ZA] = { /* SME ZA */ .core_note_type = NT_ARM_ZA, /* * ZA is a single register but it's variably sized and * the ptrace core requires that the size of any data * be an exact multiple of the configured register * size so report as though we had SVE_VQ_BYTES * registers. These values aren't exposed to * userspace. */ .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES), .size = SVE_VQ_BYTES, .align = SVE_VQ_BYTES, .regset_get = za_get, .set = za_set, }, [REGSET_ZT] = { /* SME ZT */ .core_note_type = NT_ARM_ZT, .n = 1, .size = ZT_SIG_REG_BYTES, .align = sizeof(u64), .regset_get = zt_get, .set = zt_set, }, #endif #ifdef CONFIG_ARM64_PTR_AUTH [REGSET_PAC_MASK] = { .core_note_type = NT_ARM_PAC_MASK, .n = sizeof(struct user_pac_mask) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = pac_mask_get, /* this cannot be set dynamically */ }, [REGSET_PAC_ENABLED_KEYS] = { .core_note_type = NT_ARM_PAC_ENABLED_KEYS, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = pac_enabled_keys_get, .set = pac_enabled_keys_set, }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { .core_note_type = NT_ARM_PACA_KEYS, .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), .regset_get = pac_address_keys_get, .set = pac_address_keys_set, }, [REGSET_PACG_KEYS] = { .core_note_type = NT_ARM_PACG_KEYS, .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), .size = sizeof(__uint128_t), .align = sizeof(__uint128_t), .regset_get = pac_generic_keys_get, .set = pac_generic_keys_set, }, #endif #endif #ifdef CONFIG_ARM64_TAGGED_ADDR_ABI [REGSET_TAGGED_ADDR_CTRL] = { .core_note_type = NT_ARM_TAGGED_ADDR_CTRL, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = tagged_addr_ctrl_get, .set = tagged_addr_ctrl_set, }, #endif #ifdef CONFIG_ARM64_POE [REGSET_POE] = { .core_note_type = NT_ARM_POE, .n = 1, .size = sizeof(long), .align = sizeof(long), .regset_get = poe_get, .set = poe_set, }, #endif #ifdef CONFIG_ARM64_GCS [REGSET_GCS] = { .core_note_type = NT_ARM_GCS, .n = sizeof(struct user_gcs) / sizeof(u64), .size = sizeof(u64), .align = sizeof(u64), .regset_get = gcs_get, .set = gcs_set, }, #endif }; static const struct user_regset_view user_aarch64_view = { .name = "aarch64", .e_machine = EM_AARCH64, .regsets = aarch64_regsets, .n = ARRAY_SIZE(aarch64_regsets) }; enum compat_regset { REGSET_COMPAT_GPR, REGSET_COMPAT_VFP, }; static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx) { struct pt_regs *regs = task_pt_regs(task); switch (idx) { case 15: return regs->pc; case 16: return pstate_to_compat_psr(regs->pstate); case 17: return regs->orig_x0; default: return regs->regs[idx]; } } static int compat_gpr_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int i = 0; while (to.left) membuf_store(&to, compat_get_user_reg(target, i++)); return 0; } static int compat_gpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct pt_regs newregs; int ret = 0; unsigned int i, start, num_regs; /* Calculate the number of AArch32 registers contained in count */ num_regs = count / regset->size; /* Convert pos into an register number */ start = pos / regset->size; if (start + num_regs > regset->n) return -EIO; newregs = *task_pt_regs(target); for (i = 0; i < num_regs; ++i) { unsigned int idx = start + i; compat_ulong_t reg; if (kbuf) { memcpy(&reg, kbuf, sizeof(reg)); kbuf += sizeof(reg); } else { ret = copy_from_user(&reg, ubuf, sizeof(reg)); if (ret) { ret = -EFAULT; break; } ubuf += sizeof(reg); } switch (idx) { case 15: newregs.pc = reg; break; case 16: reg = compat_psr_to_pstate(reg); newregs.pstate = reg; break; case 17: newregs.orig_x0 = reg; break; default: newregs.regs[idx] = reg; } } if (valid_user_regs(&newregs.user_regs, target)) *task_pt_regs(target) = newregs; else ret = -EINVAL; return ret; } static int compat_vfp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; if (!system_supports_fpsimd()) return -EINVAL; uregs = &target->thread.uw.fpsimd_state; if (target == current) fpsimd_preserve_current_state(); /* * The VFP registers are packed into the fpsimd_state, so they all sit * nicely together for us. We just need to create the fpscr separately. */ membuf_write(&to, uregs, VFP_STATE_SIZE - sizeof(compat_ulong_t)); fpscr = (uregs->fpsr & VFP_FPSCR_STAT_MASK) | (uregs->fpcr & VFP_FPSCR_CTRL_MASK); return membuf_store(&to, fpscr); } static int compat_vfp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { struct user_fpsimd_state *uregs; compat_ulong_t fpscr; int ret, vregs_end_pos; if (!system_supports_fpsimd()) return -EINVAL; uregs = &target->thread.uw.fpsimd_state; vregs_end_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, uregs, 0, vregs_end_pos); if (count && !ret) { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpscr, vregs_end_pos, VFP_STATE_SIZE); if (!ret) { uregs->fpsr = fpscr & VFP_FPSCR_STAT_MASK; uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; } } fpsimd_flush_task_state(target); return ret; } static int compat_tls_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value); } static int compat_tls_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret; compat_ulong_t tls = target->thread.uw.tp_value; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tls, 0, -1); if (ret) return ret; target->thread.uw.tp_value = tls; return ret; } static const struct user_regset aarch32_regsets[] = { [REGSET_COMPAT_GPR] = { .core_note_type = NT_PRSTATUS, .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_COMPAT_VFP] = { .core_note_type = NT_ARM_VFP, .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .active = fpr_active, .regset_get = compat_vfp_get, .set = compat_vfp_set }, }; static const struct user_regset_view user_aarch32_view = { .name = "aarch32", .e_machine = EM_ARM, .regsets = aarch32_regsets, .n = ARRAY_SIZE(aarch32_regsets) }; static const struct user_regset aarch32_ptrace_regsets[] = { [REGSET_GPR] = { .core_note_type = NT_PRSTATUS, .n = COMPAT_ELF_NGREG, .size = sizeof(compat_elf_greg_t), .align = sizeof(compat_elf_greg_t), .regset_get = compat_gpr_get, .set = compat_gpr_set }, [REGSET_FPR] = { .core_note_type = NT_ARM_VFP, .n = VFP_STATE_SIZE / sizeof(compat_ulong_t), .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .regset_get = compat_vfp_get, .set = compat_vfp_set }, [REGSET_TLS] = { .core_note_type = NT_ARM_TLS, .n = 1, .size = sizeof(compat_ulong_t), .align = sizeof(compat_ulong_t), .regset_get = compat_tls_get, .set = compat_tls_set, }, #ifdef CONFIG_HAVE_HW_BREAKPOINT [REGSET_HW_BREAK] = { .core_note_type = NT_ARM_HW_BREAK, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, [REGSET_HW_WATCH] = { .core_note_type = NT_ARM_HW_WATCH, .n = sizeof(struct user_hwdebug_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = hw_break_get, .set = hw_break_set, }, #endif [REGSET_SYSTEM_CALL] = { .core_note_type = NT_ARM_SYSTEM_CALL, .n = 1, .size = sizeof(int), .align = sizeof(int), .regset_get = system_call_get, .set = system_call_set, }, }; static const struct user_regset_view user_aarch32_ptrace_view = { .name = "aarch32", .e_machine = EM_ARM, .regsets = aarch32_ptrace_regsets, .n = ARRAY_SIZE(aarch32_ptrace_regsets) }; #ifdef CONFIG_COMPAT static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t __user *ret) { compat_ulong_t tmp; if (off & 3) return -EIO; if (off == COMPAT_PT_TEXT_ADDR) tmp = tsk->mm->start_code; else if (off == COMPAT_PT_DATA_ADDR) tmp = tsk->mm->start_data; else if (off == COMPAT_PT_TEXT_END_ADDR) tmp = tsk->mm->end_code; else if (off < sizeof(compat_elf_gregset_t)) tmp = compat_get_user_reg(tsk, off >> 2); else if (off >= COMPAT_USER_SZ) return -EIO; else tmp = 0; return put_user(tmp, ret); } static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off, compat_ulong_t val) { struct pt_regs newregs = *task_pt_regs(tsk); unsigned int idx = off / 4; if (off & 3 || off >= COMPAT_USER_SZ) return -EIO; if (off >= sizeof(compat_elf_gregset_t)) return 0; switch (idx) { case 15: newregs.pc = val; break; case 16: newregs.pstate = compat_psr_to_pstate(val); break; case 17: newregs.orig_x0 = val; break; default: newregs.regs[idx] = val; } if (!valid_user_regs(&newregs.user_regs, tsk)) return -EINVAL; *task_pt_regs(tsk) = newregs; return 0; } #ifdef CONFIG_HAVE_HW_BREAKPOINT /* * Convert a virtual register number into an index for a thread_info * breakpoint array. Breakpoints are identified using positive numbers * whilst watchpoints are negative. The registers are laid out as pairs * of (address, control), each pair mapping to a unique hw_breakpoint struct. * Register 0 is reserved for describing resource information. */ static int compat_ptrace_hbp_num_to_idx(compat_long_t num) { return (abs(num) - 1) >> 1; } static int compat_ptrace_hbp_get_resource_info(u32 *kdata) { u8 num_brps, num_wrps, debug_arch, wp_len; u32 reg = 0; num_brps = hw_breakpoint_slots(TYPE_INST); num_wrps = hw_breakpoint_slots(TYPE_DATA); debug_arch = debug_monitors_arch(); wp_len = 8; reg |= debug_arch; reg <<= 8; reg |= wp_len; reg <<= 8; reg |= num_wrps; reg <<= 8; reg |= num_brps; *kdata = reg; return 0; } static int compat_ptrace_hbp_get(unsigned int note_type, struct task_struct *tsk, compat_long_t num, u32 *kdata) { u64 addr = 0; u32 ctrl = 0; int err, idx = compat_ptrace_hbp_num_to_idx(num); if (num & 1) { err = ptrace_hbp_get_addr(note_type, tsk, idx, &addr); *kdata = (u32)addr; } else { err = ptrace_hbp_get_ctrl(note_type, tsk, idx, &ctrl); *kdata = ctrl; } return err; } static int compat_ptrace_hbp_set(unsigned int note_type, struct task_struct *tsk, compat_long_t num, u32 *kdata) { u64 addr; u32 ctrl; int err, idx = compat_ptrace_hbp_num_to_idx(num); if (num & 1) { addr = *kdata; err = ptrace_hbp_set_addr(note_type, tsk, idx, addr); } else { ctrl = *kdata; err = ptrace_hbp_set_ctrl(note_type, tsk, idx, ctrl); } return err; } static int compat_ptrace_gethbpregs(struct task_struct *tsk, compat_long_t num, compat_ulong_t __user *data) { int ret; u32 kdata; /* Watchpoint */ if (num < 0) { ret = compat_ptrace_hbp_get(NT_ARM_HW_WATCH, tsk, num, &kdata); /* Resource info */ } else if (num == 0) { ret = compat_ptrace_hbp_get_resource_info(&kdata); /* Breakpoint */ } else { ret = compat_ptrace_hbp_get(NT_ARM_HW_BREAK, tsk, num, &kdata); } if (!ret) ret = put_user(kdata, data); return ret; } static int compat_ptrace_sethbpregs(struct task_struct *tsk, compat_long_t num, compat_ulong_t __user *data) { int ret; u32 kdata = 0; if (num == 0) return 0; ret = get_user(kdata, data); if (ret) return ret; if (num < 0) ret = compat_ptrace_hbp_set(NT_ARM_HW_WATCH, tsk, num, &kdata); else ret = compat_ptrace_hbp_set(NT_ARM_HW_BREAK, tsk, num, &kdata); return ret; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { case PTRACE_PEEKUSR: ret = compat_ptrace_read_user(child, addr, datap); break; case PTRACE_POKEUSR: ret = compat_ptrace_write_user(child, addr, data); break; case COMPAT_PTRACE_GETREGS: ret = copy_regset_to_user(child, &user_aarch32_view, REGSET_COMPAT_GPR, 0, sizeof(compat_elf_gregset_t), datap); break; case COMPAT_PTRACE_SETREGS: ret = copy_regset_from_user(child, &user_aarch32_view, REGSET_COMPAT_GPR, 0, sizeof(compat_elf_gregset_t), datap); break; case COMPAT_PTRACE_GET_THREAD_AREA: ret = put_user((compat_ulong_t)child->thread.uw.tp_value, (compat_ulong_t __user *)datap); break; case COMPAT_PTRACE_SET_SYSCALL: task_pt_regs(child)->syscallno = data; ret = 0; break; case COMPAT_PTRACE_GETVFPREGS: ret = copy_regset_to_user(child, &user_aarch32_view, REGSET_COMPAT_VFP, 0, VFP_STATE_SIZE, datap); break; case COMPAT_PTRACE_SETVFPREGS: ret = copy_regset_from_user(child, &user_aarch32_view, REGSET_COMPAT_VFP, 0, VFP_STATE_SIZE, datap); break; #ifdef CONFIG_HAVE_HW_BREAKPOINT case COMPAT_PTRACE_GETHBPREGS: ret = compat_ptrace_gethbpregs(child, addr, datap); break; case COMPAT_PTRACE_SETHBPREGS: ret = compat_ptrace_sethbpregs(child, addr, datap); break; #endif default: ret = compat_ptrace_request(child, request, addr, data); break; } return ret; } #endif /* CONFIG_COMPAT */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { /* * Core dumping of 32-bit tasks or compat ptrace requests must use the * user_aarch32_view compatible with arm32. Native ptrace requests on * 32-bit children use an extended user_aarch32_ptrace_view to allow * access to the TLS register. */ if (is_compat_task()) return &user_aarch32_view; else if (is_compat_thread(task_thread_info(task))) return &user_aarch32_ptrace_view; return &user_aarch64_view; } long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { switch (request) { case PTRACE_PEEKMTETAGS: case PTRACE_POKEMTETAGS: return mte_ptrace_copy_tags(child, request, addr, data); } return ptrace_request(child, request, addr, data); } enum ptrace_syscall_dir { PTRACE_SYSCALL_ENTER = 0, PTRACE_SYSCALL_EXIT, }; static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir) { int regno; unsigned long saved_reg; /* * We have some ABI weirdness here in the way that we handle syscall * exit stops because we indicate whether or not the stop has been * signalled from syscall entry or syscall exit by clobbering a general * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee * and restoring its old value after the stop. This means that: * * - Any writes by the tracer to this register during the stop are * ignored/discarded. * * - The actual value of the register is not available during the stop, * so the tracer cannot save it and restore it later. * * - Syscall stops behave differently to seccomp and pseudo-step traps * (the latter do not nobble any registers). */ regno = (is_compat_task() ? 12 : 7); saved_reg = regs->regs[regno]; regs->regs[regno] = dir; if (dir == PTRACE_SYSCALL_ENTER) { if (ptrace_report_syscall_entry(regs)) forget_syscall(regs); regs->regs[regno] = saved_reg; } else if (!test_thread_flag(TIF_SINGLESTEP)) { ptrace_report_syscall_exit(regs, 0); regs->regs[regno] = saved_reg; } else { regs->regs[regno] = saved_reg; /* * Signal a pseudo-step exception since we are stepping but * tracer modifications to the registers may have rewound the * state machine. */ ptrace_report_syscall_exit(regs, 1); } } int syscall_trace_enter(struct pt_regs *regs) { unsigned long flags = read_thread_flags(); if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) { report_syscall(regs, PTRACE_SYSCALL_ENTER); if (flags & _TIF_SYSCALL_EMU) return NO_SYSCALL; } /* Do the secure computing after ptrace; failures should be fast. */ if (secure_computing() == -1) return NO_SYSCALL; if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, regs->syscallno); audit_syscall_entry(regs->syscallno, regs->orig_x0, regs->regs[1], regs->regs[2], regs->regs[3]); return regs->syscallno; } void syscall_trace_exit(struct pt_regs *regs) { unsigned long flags = read_thread_flags(); audit_syscall_exit(regs); if (flags & _TIF_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)) report_syscall(regs, PTRACE_SYSCALL_EXIT); rseq_syscall(regs); } /* * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is * not described in ARM DDI 0487D.a. * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may * be allocated an EL0 meaning in future. * Userspace cannot use these until they have an architectural meaning. * Note that this follows the SPSR_ELx format, not the AArch32 PSR format. * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \ GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) static int valid_compat_regs(struct user_pt_regs *regs) { regs->pstate &= ~SPSR_EL1_AARCH32_RES0_BITS; if (!system_supports_mixed_endian_el0()) { if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) regs->pstate |= PSR_AA32_E_BIT; else regs->pstate &= ~PSR_AA32_E_BIT; } if (user_mode(regs) && (regs->pstate & PSR_MODE32_BIT) && (regs->pstate & PSR_AA32_A_BIT) == 0 && (regs->pstate & PSR_AA32_I_BIT) == 0 && (regs->pstate & PSR_AA32_F_BIT) == 0) { return 1; } /* * Force PSR to a valid 32-bit EL0t, preserving the same bits as * arch/arm. */ regs->pstate &= PSR_AA32_N_BIT | PSR_AA32_Z_BIT | PSR_AA32_C_BIT | PSR_AA32_V_BIT | PSR_AA32_Q_BIT | PSR_AA32_IT_MASK | PSR_AA32_GE_MASK | PSR_AA32_E_BIT | PSR_AA32_T_BIT; regs->pstate |= PSR_MODE32_BIT; return 0; } static int valid_native_regs(struct user_pt_regs *regs) { regs->pstate &= ~SPSR_EL1_AARCH64_RES0_BITS; if (user_mode(regs) && !(regs->pstate & PSR_MODE32_BIT) && (regs->pstate & PSR_D_BIT) == 0 && (regs->pstate & PSR_A_BIT) == 0 && (regs->pstate & PSR_I_BIT) == 0 && (regs->pstate & PSR_F_BIT) == 0) { return 1; } /* Force PSR to a valid 64-bit EL0t */ regs->pstate &= PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT; return 0; } /* * Are the current registers suitable for user mode? (used to maintain * security in signal handlers) */ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task) { /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */ user_regs_reset_single_step(regs, task); if (is_compat_thread(task_thread_info(task))) return valid_compat_regs(regs); else return valid_native_regs(regs); }
766 566 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ struct rw_semaphore { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. */ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define RWSEM_UNLOCKED_VALUE 0UL #define RWSEM_WRITER_LOCKED (1UL << 0) #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); } static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); } /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .wait_list = LIST_HEAD_INIT((name).wait_list), \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline int rwsem_is_contended(struct rw_semaphore *sem) { return !list_empty(&sem->wait_list); } #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> struct rw_semaphore { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rwsem_is_locked(sem)); } static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) { WARN_ON(!rw_base_is_write_locked(&sem->rwbase)); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ static inline void rwsem_assert_held(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held(sem); else rwsem_assert_held_nolockdep(sem); } static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) { if (IS_ENABLED(CONFIG_LOCKDEP)) lockdep_assert_held_write(sem); else rwsem_assert_held_write_nolockdep(sem); } /* * lock for reading */ extern void down_read(struct rw_semaphore *sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem); extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T)) DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0) DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T)) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
33 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 // SPDX-License-Identifier: GPL-2.0-only /* * Based on arch/arm/kernel/traps.c * * Copyright (C) 1995-2009 Russell King * Copyright (C) 2012 ARM Ltd. */ #include <linux/bug.h> #include <linux/context_tracking.h> #include <linux/signal.h> #include <linux/kallsyms.h> #include <linux/kprobes.h> #include <linux/spinlock.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> #include <linux/module.h> #include <linux/kexec.h> #include <linux/delay.h> #include <linux/efi.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/sizes.h> #include <linux/syscalls.h> #include <linux/mm_types.h> #include <linux/kasan.h> #include <linux/ubsan.h> #include <linux/cfi.h> #include <asm/atomic.h> #include <asm/bug.h> #include <asm/cpufeature.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> #include <asm/efi.h> #include <asm/esr.h> #include <asm/exception.h> #include <asm/extable.h> #include <asm/insn.h> #include <asm/kprobes.h> #include <asm/text-patching.h> #include <asm/traps.h> #include <asm/smp.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/system_misc.h> #include <asm/sysreg.h> static bool __kprobes __check_eq(unsigned long pstate) { return (pstate & PSR_Z_BIT) != 0; } static bool __kprobes __check_ne(unsigned long pstate) { return (pstate & PSR_Z_BIT) == 0; } static bool __kprobes __check_cs(unsigned long pstate) { return (pstate & PSR_C_BIT) != 0; } static bool __kprobes __check_cc(unsigned long pstate) { return (pstate & PSR_C_BIT) == 0; } static bool __kprobes __check_mi(unsigned long pstate) { return (pstate & PSR_N_BIT) != 0; } static bool __kprobes __check_pl(unsigned long pstate) { return (pstate & PSR_N_BIT) == 0; } static bool __kprobes __check_vs(unsigned long pstate) { return (pstate & PSR_V_BIT) != 0; } static bool __kprobes __check_vc(unsigned long pstate) { return (pstate & PSR_V_BIT) == 0; } static bool __kprobes __check_hi(unsigned long pstate) { pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ return (pstate & PSR_C_BIT) != 0; } static bool __kprobes __check_ls(unsigned long pstate) { pstate &= ~(pstate >> 1); /* PSR_C_BIT &= ~PSR_Z_BIT */ return (pstate & PSR_C_BIT) == 0; } static bool __kprobes __check_ge(unsigned long pstate) { pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ return (pstate & PSR_N_BIT) == 0; } static bool __kprobes __check_lt(unsigned long pstate) { pstate ^= (pstate << 3); /* PSR_N_BIT ^= PSR_V_BIT */ return (pstate & PSR_N_BIT) != 0; } static bool __kprobes __check_gt(unsigned long pstate) { /*PSR_N_BIT ^= PSR_V_BIT */ unsigned long temp = pstate ^ (pstate << 3); temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ return (temp & PSR_N_BIT) == 0; } static bool __kprobes __check_le(unsigned long pstate) { /*PSR_N_BIT ^= PSR_V_BIT */ unsigned long temp = pstate ^ (pstate << 3); temp |= (pstate << 1); /*PSR_N_BIT |= PSR_Z_BIT */ return (temp & PSR_N_BIT) != 0; } static bool __kprobes __check_al(unsigned long pstate) { return true; } /* * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that * it behaves identically to 0b1110 ("al"). */ pstate_check_t * const aarch32_opcode_cond_checks[16] = { __check_eq, __check_ne, __check_cs, __check_cc, __check_mi, __check_pl, __check_vs, __check_vc, __check_hi, __check_ls, __check_ge, __check_lt, __check_gt, __check_le, __check_al, __check_al }; int show_unhandled_signals = 0; static void dump_kernel_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; if (user_mode(regs)) return; for (i = -4; i < 1; i++) { unsigned int val, bad; bad = aarch64_insn_read(&((u32 *)addr)[i], &val); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); else p += sprintf(p, i == 0 ? "(????????) " : "???????? "); } printk("%sCode: %s\n", lvl, str); } #define S_SMP " SMP" static int __die(const char *str, long err, struct pt_regs *regs) { static int die_counter; int ret; pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n", str, err, ++die_counter); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV); if (ret == NOTIFY_STOP) return ret; print_modules(); show_regs(regs); dump_kernel_instr(KERN_EMERG, regs); return ret; } static DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. */ void die(const char *str, struct pt_regs *regs, long err) { int ret; unsigned long flags; raw_spin_lock_irqsave(&die_lock, flags); oops_enter(); console_verbose(); bust_spinlocks(1); ret = __die(str, err, regs); if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); oops_exit(); if (in_interrupt()) panic("%s: Fatal exception in interrupt", str); if (panic_on_oops) panic("%s: Fatal exception", str); raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) make_task_dead(SIGSEGV); } static void arm64_show_signal(int signo, const char *str) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct task_struct *tsk = current; unsigned long esr = tsk->thread.fault_code; struct pt_regs *regs = task_pt_regs(tsk); /* Leave if the signal won't be shown */ if (!show_unhandled_signals || !unhandled_signal(tsk, signo) || !__ratelimit(&rs)) return; pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk)); if (esr) pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr); pr_cont("%s", str); print_vma_addr(KERN_CONT " in ", regs->pc); pr_cont("\n"); __show_regs(regs); } void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); if (signo == SIGKILL) force_sig(SIGKILL); else force_sig_fault(signo, code, (void __user *)far); } void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey) { arm64_show_signal(SIGSEGV, str); force_sig_pkuerr((void __user *)far, pkey); } void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); force_sig_mceerr(code, (void __user *)far, lsb); } void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, int signo, int sicode, unsigned long far, unsigned long err) { if (user_mode(regs)) { WARN_ON(regs != current_pt_regs()); current->thread.fault_address = 0; current->thread.fault_code = err; arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } } #ifdef CONFIG_COMPAT #define PSTATE_IT_1_0_SHIFT 25 #define PSTATE_IT_1_0_MASK (0x3 << PSTATE_IT_1_0_SHIFT) #define PSTATE_IT_7_2_SHIFT 10 #define PSTATE_IT_7_2_MASK (0x3f << PSTATE_IT_7_2_SHIFT) static u32 compat_get_it_state(struct pt_regs *regs) { u32 it, pstate = regs->pstate; it = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT; it |= ((pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT) << 2; return it; } static void compat_set_it_state(struct pt_regs *regs, u32 it) { u32 pstate_it; pstate_it = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK; pstate_it |= ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK; regs->pstate &= ~PSR_AA32_IT_MASK; regs->pstate |= pstate_it; } static void advance_itstate(struct pt_regs *regs) { u32 it; /* ARM mode */ if (!(regs->pstate & PSR_AA32_T_BIT) || !(regs->pstate & PSR_AA32_IT_MASK)) return; it = compat_get_it_state(regs); /* * If this is the last instruction of the block, wipe the IT * state. Otherwise advance it. */ if (!(it & 7)) it = 0; else it = (it & 0xe0) | ((it << 1) & 0x1f); compat_set_it_state(regs, it); } #else static void advance_itstate(struct pt_regs *regs) { } #endif void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size) { regs->pc += size; /* * If we were single stepping, we want to get the step exception after * we return from the trap. */ if (user_mode(regs)) user_fastforward_single_step(current); if (compat_user_mode(regs)) advance_itstate(regs); else regs->pstate &= ~PSR_BTYPE_MASK; } static int user_insn_read(struct pt_regs *regs, u32 *insnp) { u32 instr; unsigned long pc = instruction_pointer(regs); if (compat_thumb_mode(regs)) { /* 16-bit Thumb instruction */ __le16 instr_le; if (get_user(instr_le, (__le16 __user *)pc)) return -EFAULT; instr = le16_to_cpu(instr_le); if (aarch32_insn_is_wide(instr)) { u32 instr2; if (get_user(instr_le, (__le16 __user *)(pc + 2))) return -EFAULT; instr2 = le16_to_cpu(instr_le); instr = (instr << 16) | instr2; } } else { /* 32-bit ARM instruction */ __le32 instr_le; if (get_user(instr_le, (__le32 __user *)pc)) return -EFAULT; instr = le32_to_cpu(instr_le); } *insnp = instr; return 0; } void force_signal_inject(int signal, int code, unsigned long address, unsigned long err) { const char *desc; struct pt_regs *regs = current_pt_regs(); if (WARN_ON(!user_mode(regs))) return; switch (signal) { case SIGILL: desc = "undefined instruction"; break; case SIGSEGV: desc = "illegal memory access"; break; default: desc = "unknown or unrecoverable error"; break; } /* Force signals we don't understand to SIGKILL */ if (WARN_ON(signal != SIGKILL && siginfo_layout(signal, code) != SIL_FAULT)) { signal = SIGKILL; } arm64_notify_die(desc, regs, signal, code, address, err); } /* * Set up process info to signal segmentation fault - called on access error. */ void arm64_notify_segfault(unsigned long addr) { int code; mmap_read_lock(current->mm); if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; mmap_read_unlock(current->mm); force_signal_inject(SIGSEGV, code, addr, 0); } void do_el0_undef(struct pt_regs *regs, unsigned long esr) { u32 insn; /* check for AArch32 breakpoint instructions */ if (!aarch32_break_handler(regs)) return; if (user_insn_read(regs, &insn)) goto out_err; if (try_emulate_mrs(regs, insn)) return; if (try_emulate_armv8_deprecated(regs, insn)) return; out_err: force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } void do_el1_undef(struct pt_regs *regs, unsigned long esr) { u32 insn; if (aarch64_insn_read((void *)regs->pc, &insn)) goto out_err; if (try_emulate_el1_ssbs(regs, insn)) return; out_err: die("Oops - Undefined instruction", regs, esr); } void do_el0_bti(struct pt_regs *regs) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } void do_el1_bti(struct pt_regs *regs, unsigned long esr) { if (efi_runtime_fixup_exception(regs, "BTI violation")) { regs->pstate &= ~PSR_BTYPE_MASK; return; } die("Oops - BTI", regs, esr); } void do_el0_gcs(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGSEGV, SEGV_CPERR, regs->pc, 0); } void do_el1_gcs(struct pt_regs *regs, unsigned long esr) { die("Oops - GCS", regs, esr); } void do_el0_fpac(struct pt_regs *regs, unsigned long esr) { force_signal_inject(SIGILL, ILL_ILLOPN, regs->pc, esr); } void do_el1_fpac(struct pt_regs *regs, unsigned long esr) { /* * Unexpected FPAC exception in the kernel: kill the task before it * does any more harm. */ die("Oops - FPAC", regs, esr); } void do_el0_mops(struct pt_regs *regs, unsigned long esr) { arm64_mops_reset_regs(&regs->user_regs, esr); /* * If single stepping then finish the step before executing the * prologue instruction. */ user_fastforward_single_step(current); } void do_el1_mops(struct pt_regs *regs, unsigned long esr) { arm64_mops_reset_regs(&regs->user_regs, esr); kernel_fastforward_single_step(regs); } #define __user_cache_maint(insn, address, res) \ if (address >= TASK_SIZE_MAX) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ asm volatile ( \ "1: " insn ", %1\n" \ " mov %w0, #0\n" \ "2:\n" \ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \ : "=r" (res) \ : "r" (address)); \ uaccess_ttbr0_disable(); \ } static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs) { unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; tagged_address = pt_regs_read_reg(regs, rt); address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ __user_cache_maint("sys 3, c7, c13, 1", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; case ESR_ELx_SYS64_ISS_CRM_IC_IVAU: /* IC IVAU */ __user_cache_maint("ic ivau", address, ret); break; default: force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } if (ret) arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } static void ctr_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); unsigned long val = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0); if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) { /* Hide DIC so that we can trap the unnecessary maintenance...*/ val &= ~BIT(CTR_EL0_DIC_SHIFT); /* ... and fake IminLine to reduce the number of traps. */ val &= ~CTR_EL0_IminLine_MASK; val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK; } pt_regs_write_reg(regs, rt, val); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { if (test_thread_flag(TIF_TSC_SIGSEGV)) { force_sig(SIGSEGV); } else { int rt = ESR_ELx_SYS64_ISS_RT(esr); pt_regs_write_reg(regs, rt, arch_timer_read_counter()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } } static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { if (test_thread_flag(TIF_TSC_SIGSEGV)) { force_sig(SIGSEGV); } else { int rt = ESR_ELx_SYS64_ISS_RT(esr); pt_regs_write_reg(regs, rt, arch_timer_get_rate()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } } static void mrs_handler(unsigned long esr, struct pt_regs *regs) { u32 sysreg, rt; rt = ESR_ELx_SYS64_ISS_RT(esr); sysreg = esr_sys64_to_sysreg(esr); if (do_emulate_mrs(regs, sysreg, rt) != 0) force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); } static void wfi_handler(unsigned long esr, struct pt_regs *regs) { arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } struct sys64_hook { unsigned long esr_mask; unsigned long esr_val; void (*handler)(unsigned long esr, struct pt_regs *regs); }; static const struct sys64_hook sys64_hooks[] = { { .esr_mask = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL, .handler = user_cache_maint_handler, }, { /* Trap read access to CTR_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ, .handler = ctr_read_handler, }, { /* Trap read access to CNTVCT_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT, .handler = cntvct_read_handler, }, { /* Trap read access to CNTVCTSS_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS, .handler = cntvct_read_handler, }, { /* Trap read access to CNTFRQ_EL0 */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, .handler = cntfrq_read_handler, }, { /* Trap read access to CPUID registers */ .esr_mask = ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK, .esr_val = ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL, .handler = mrs_handler, }, { /* Trap WFI instructions executed in userspace */ .esr_mask = ESR_ELx_WFx_MASK, .esr_val = ESR_ELx_WFx_WFI_VAL, .handler = wfi_handler, }, {}, }; #ifdef CONFIG_COMPAT static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs) { int cond; /* Only a T32 instruction can trap without CV being set */ if (!(esr & ESR_ELx_CV)) { u32 it; it = compat_get_it_state(regs); if (!it) return true; cond = it >> 4; } else { cond = (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT; } return aarch32_opcode_cond_checks[cond](regs->pstate); } static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs) { int reg = (esr & ESR_ELx_CP15_32_ISS_RT_MASK) >> ESR_ELx_CP15_32_ISS_RT_SHIFT; pt_regs_write_reg(regs, reg, arch_timer_get_rate()); arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_32_hooks[] = { { .esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ, .handler = compat_cntfrq_read_handler, }, {}, }; static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; u64 val = arch_timer_read_counter(); pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); arm64_skip_faulting_instruction(regs, 4); } static const struct sys64_hook cp15_64_hooks[] = { { .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCT, .handler = compat_cntvct_read_handler, }, { .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK, .esr_val = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS, .handler = compat_cntvct_read_handler, }, {}, }; void do_el0_cp15(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook, *hook_base; if (!cp15_cond_valid(esr, regs)) { /* * There is no T16 variant of a CP access, so we * always advance PC by 4 bytes. */ arm64_skip_faulting_instruction(regs, 4); return; } switch (ESR_ELx_EC(esr)) { case ESR_ELx_EC_CP15_32: hook_base = cp15_32_hooks; break; case ESR_ELx_EC_CP15_64: hook_base = cp15_64_hooks; break; default: do_el0_undef(regs, esr); return; } for (hook = hook_base; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { hook->handler(esr, regs); return; } /* * New cp15 instructions may previously have been undefined at * EL0. Fall back to our usual undefined instruction handler * so that we handle these consistently. */ do_el0_undef(regs, esr); } #endif void do_el0_sys(unsigned long esr, struct pt_regs *regs) { const struct sys64_hook *hook; for (hook = sys64_hooks; hook->handler; hook++) if ((hook->esr_mask & esr) == hook->esr_val) { hook->handler(esr, regs); return; } /* * New SYS instructions may previously have been undefined at EL0. Fall * back to our usual undefined instruction handler so that we handle * these consistently. */ do_el0_undef(regs, esr); } static const char *esr_class_str[] = { [0 ... ESR_ELx_EC_MAX] = "UNRECOGNIZED EC", [ESR_ELx_EC_UNKNOWN] = "Unknown/Uncategorized", [ESR_ELx_EC_WFx] = "WFI/WFE", [ESR_ELx_EC_CP15_32] = "CP15 MCR/MRC", [ESR_ELx_EC_CP15_64] = "CP15 MCRR/MRRC", [ESR_ELx_EC_CP14_MR] = "CP14 MCR/MRC", [ESR_ELx_EC_CP14_LS] = "CP14 LDC/STC", [ESR_ELx_EC_FP_ASIMD] = "ASIMD", [ESR_ELx_EC_CP10_ID] = "CP10 MRC/VMRS", [ESR_ELx_EC_PAC] = "PAC", [ESR_ELx_EC_CP14_64] = "CP14 MCRR/MRRC", [ESR_ELx_EC_BTI] = "BTI", [ESR_ELx_EC_ILL] = "PSTATE.IL", [ESR_ELx_EC_SVC32] = "SVC (AArch32)", [ESR_ELx_EC_HVC32] = "HVC (AArch32)", [ESR_ELx_EC_SMC32] = "SMC (AArch32)", [ESR_ELx_EC_SVC64] = "SVC (AArch64)", [ESR_ELx_EC_HVC64] = "HVC (AArch64)", [ESR_ELx_EC_SMC64] = "SMC (AArch64)", [ESR_ELx_EC_SYS64] = "MSR/MRS (AArch64)", [ESR_ELx_EC_SVE] = "SVE", [ESR_ELx_EC_ERET] = "ERET/ERETAA/ERETAB", [ESR_ELx_EC_FPAC] = "FPAC", [ESR_ELx_EC_SME] = "SME", [ESR_ELx_EC_IMP_DEF] = "EL3 IMP DEF", [ESR_ELx_EC_IABT_LOW] = "IABT (lower EL)", [ESR_ELx_EC_IABT_CUR] = "IABT (current EL)", [ESR_ELx_EC_PC_ALIGN] = "PC Alignment", [ESR_ELx_EC_DABT_LOW] = "DABT (lower EL)", [ESR_ELx_EC_DABT_CUR] = "DABT (current EL)", [ESR_ELx_EC_SP_ALIGN] = "SP Alignment", [ESR_ELx_EC_MOPS] = "MOPS", [ESR_ELx_EC_FP_EXC32] = "FP (AArch32)", [ESR_ELx_EC_FP_EXC64] = "FP (AArch64)", [ESR_ELx_EC_GCS] = "Guarded Control Stack", [ESR_ELx_EC_SERROR] = "SError", [ESR_ELx_EC_BREAKPT_LOW] = "Breakpoint (lower EL)", [ESR_ELx_EC_BREAKPT_CUR] = "Breakpoint (current EL)", [ESR_ELx_EC_SOFTSTP_LOW] = "Software Step (lower EL)", [ESR_ELx_EC_SOFTSTP_CUR] = "Software Step (current EL)", [ESR_ELx_EC_WATCHPT_LOW] = "Watchpoint (lower EL)", [ESR_ELx_EC_WATCHPT_CUR] = "Watchpoint (current EL)", [ESR_ELx_EC_BKPT32] = "BKPT (AArch32)", [ESR_ELx_EC_VECTOR32] = "Vector catch (AArch32)", [ESR_ELx_EC_BRK64] = "BRK (AArch64)", }; const char *esr_get_class_string(unsigned long esr) { return esr_class_str[ESR_ELx_EC(esr)]; } /* * bad_el0_sync handles unexpected, but potentially recoverable synchronous * exceptions taken from EL0. */ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr) { unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; arm64_force_sig_fault(SIGILL, ILL_ILLOPC, pc, "Bad EL0 synchronous exception"); } #ifdef CONFIG_VMAP_STACK DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr)); pr_emerg("FAR: 0x%016lx\n", far); pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); __show_regs(regs); /* * We use nmi_panic to limit the potential for recusive overflows, and * to get a better stack trace. */ nmi_panic(NULL, "kernel stack overflow"); cpu_park_loop(); } #endif void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr) { console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); if (regs) __show_regs(regs); nmi_panic(regs, "Asynchronous SError Interrupt"); cpu_park_loop(); } bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { unsigned long aet = arm64_ras_serror_get_severity(esr); switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ /* * The CPU can make progress. We may take UEO again as * a more severe error. */ return false; case ESR_ELx_AET_UEU: /* Uncorrected Unrecoverable */ case ESR_ELx_AET_UER: /* Uncorrected Recoverable */ /* * The CPU can't make progress. The exception may have * been imprecise. * * Neoverse-N1 #1349291 means a non-KVM SError reported as * Unrecoverable should be treated as Uncontainable. We * call arm64_serror_panic() in both cases. */ return true; case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ default: /* Error has been silently propagated */ arm64_serror_panic(regs, esr); } } void do_serror(struct pt_regs *regs, unsigned long esr) { /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); } /* GENERIC_BUG traps */ #ifdef CONFIG_GENERIC_BUG int is_valid_bugaddr(unsigned long addr) { /* * bug_handler() only called for BRK #BUG_BRK_IMM. * So the answer is trivial -- any spurious instances with no * bug table entry will be rejected by report_bug() and passed * back to the debug-monitors code and handled as a fatal * unexpected debug exception. */ return 1; } #endif static int bug_handler(struct pt_regs *regs, unsigned long esr) { switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: die("Oops - BUG", regs, esr); break; case BUG_TRAP_TYPE_WARN: break; default: /* unknown/unrecognised bug trap type */ return DBG_HOOK_ERROR; } /* If thread survives, skip over the BUG instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } static struct break_hook bug_break_hook = { .fn = bug_handler, .imm = BUG_BRK_IMM, }; #ifdef CONFIG_CFI_CLANG static int cfi_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; u32 type; target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr)); type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr)); switch (report_cfi_failure(regs, regs->pc, &target, type)) { case BUG_TRAP_TYPE_BUG: die("Oops - CFI", regs, esr); break; case BUG_TRAP_TYPE_WARN: break; default: return DBG_HOOK_ERROR; } arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } static struct break_hook cfi_break_hook = { .fn = cfi_handler, .imm = CFI_BRK_IMM_BASE, .mask = CFI_BRK_IMM_MASK, }; #endif /* CONFIG_CFI_CLANG */ static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr) { pr_err("%s generated an invalid instruction at %pS!\n", "Kernel text patching", (void *)instruction_pointer(regs)); /* We cannot handle this */ return DBG_HOOK_ERROR; } static struct break_hook fault_break_hook = { .fn = reserved_fault_handler, .imm = FAULT_BRK_IMM, }; #ifdef CONFIG_KASAN_SW_TAGS #define KASAN_ESR_RECOVER 0x20 #define KASAN_ESR_WRITE 0x10 #define KASAN_ESR_SIZE_MASK 0x0f #define KASAN_ESR_SIZE(esr) (1 << ((esr) & KASAN_ESR_SIZE_MASK)) static int kasan_handler(struct pt_regs *regs, unsigned long esr) { bool recover = esr & KASAN_ESR_RECOVER; bool write = esr & KASAN_ESR_WRITE; size_t size = KASAN_ESR_SIZE(esr); void *addr = (void *)regs->regs[0]; u64 pc = regs->pc; kasan_report(addr, size, write, pc); /* * The instrumentation allows to control whether we can proceed after * a crash was detected. This is done by passing the -recover flag to * the compiler. Disabling recovery allows to generate more compact * code. * * Unfortunately disabling recovery doesn't work for the kernel right * now. KASAN reporting is disabled in some contexts (for example when * the allocator accesses slab object metadata; this is controlled by * current->kasan_depth). All these accesses are detected by the tool, * even though the reports for them are not printed. * * This is something that might be fixed at some point in the future. */ if (!recover) die("Oops - KASAN", regs, esr); /* If thread survives, skip over the brk instruction and continue: */ arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } static struct break_hook kasan_break_hook = { .fn = kasan_handler, .imm = KASAN_BRK_IMM, .mask = KASAN_BRK_MASK, }; #endif #ifdef CONFIG_UBSAN_TRAP static int ubsan_handler(struct pt_regs *regs, unsigned long esr) { die(report_ubsan_failure(esr & UBSAN_BRK_MASK), regs, esr); return DBG_HOOK_HANDLED; } static struct break_hook ubsan_break_hook = { .fn = ubsan_handler, .imm = UBSAN_BRK_IMM, .mask = UBSAN_BRK_MASK, }; #endif /* * Initial handler for AArch64 BRK exceptions * This handler only used until debug_traps_init(). */ int __init early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs) { #ifdef CONFIG_CFI_CLANG if (esr_is_cfi_brk(esr)) return cfi_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_KASAN_SW_TAGS if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif #ifdef CONFIG_UBSAN_TRAP if (esr_is_ubsan_brk(esr)) return ubsan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; } void __init trap_init(void) { register_kernel_break_hook(&bug_break_hook); #ifdef CONFIG_CFI_CLANG register_kernel_break_hook(&cfi_break_hook); #endif register_kernel_break_hook(&fault_break_hook); #ifdef CONFIG_KASAN_SW_TAGS register_kernel_break_hook(&kasan_break_hook); #endif #ifdef CONFIG_UBSAN_TRAP register_kernel_break_hook(&ubsan_break_hook); #endif debug_traps_init(); }
289 289 289 289 289 285 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * vma.h * * Core VMA manipulation API implemented in vma.c. */ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* * Describes a VMA merge operation and is threaded throughout it. * * Any of the fields may be mutated by the merge operation, so no guarantees are * made to the contents of this structure after a merge operation has completed. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; /* * Adjacent VMAs, any of which may be NULL if not present: * * |------|--------|------| * | prev | middle | next | * |------|--------|------| * * middle may not yet exist in the case of a proposed new VMA being * merged, or it may be an existing VMA. * * next may be assigned by the caller. */ struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller * and describe the proposed new VMA range, whether modifying an * existing VMA (which will be 'middle'), or adding a new one. * * During the merge process these fields are updated to describe the new * range _including those VMAs which will be merged_. */ unsigned long start; unsigned long end; pgoff_t pgoff; unsigned long flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; /* Flags which callers can use to modify merge behaviour: */ /* * If we can expand, simply do so. We know there is nothing to merge to * the right. Does not reset state upon failure to merge. The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ bool just_expand :1; /* * If a merge is possible, but an OOM error occurs, give up and don't * execute the merge, returning NULL. */ bool give_up_on_oom :1; /* * If set, skip uprobe_mmap upon merged vma. */ bool skip_vma_uprobe :1; /* Internal flags set during merge process: */ /* * Internal flag indicating the merge increases vmg->middle->vm_start * (and thereby, vmg->prev->vm_end). */ bool __adjust_middle_start :1; /* * Internal flag indicating the merge decreases vmg->next->vm_start * (and thereby, vmg->middle->vm_end). */ bool __adjust_next_start :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->middle. */ bool __remove_middle :1; /* * Internal flag used during the merge operationr to indicate we will * remove vmg->next. */ bool __remove_next :1; }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .flags = flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .middle = vma_, \ .next = NULL, \ .start = start_, \ .end = end_, \ .flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } /* * Temporary helper functions for file systems which wrap an invocation of * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { desc->mm = vma->vm_mm; desc->start = vma->vm_start; desc->end = vma->vm_end; desc->pgoff = vma->vm_pgoff; desc->file = vma->vm_file; desc->vm_flags = vma->vm_flags; desc->page_prot = vma->vm_page_prot; desc->vm_ops = NULL; desc->private_data = NULL; return desc; } static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* * Since we're invoking .mmap_prepare() despite having a partially * established VMA, we must take care to handle setting fields * correctly. */ /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; if (vma->vm_file != desc->file) vma_set_file(vma, desc->file); if (vma->vm_flags != desc->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; /* User-defined fields. */ vma->vm_ops = desc->vm_ops; vma->vm_private_data = desc->private_data; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ __must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); __must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); void unlink_file_vma(struct vm_area_struct *vma); void vma_link_file(struct vm_area_struct *vma); int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. */ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store_overwrite(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline void vma_iter_store_new(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_mark_attached(vma); vma_iter_store_overwrite(vmi, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } /* * check if a vma is sealed for modification. * return true, if modification is allowed. */ static inline bool can_modify_vma(struct vm_area_struct *vma) { if (unlikely(vma_is_sealed(vma))) return false; return true; } bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior); #else static inline bool can_modify_vma(struct vm_area_struct *vma) { return true; } static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior) { return true; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); /* vma_init.h, shared between CONFIG_MMU and nommu. */ void __init vma_state_init(void); struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); void vm_area_free(struct vm_area_struct *vma); /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif #endif /* __MM_VMA_H */
53 78 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SWAPOPS_H #define _LINUX_SWAPOPS_H #include <linux/radix-tree.h> #include <linux/bug.h> #include <linux/mm_types.h> #ifdef CONFIG_MMU #ifdef CONFIG_SWAP #include <linux/swapfile.h> #endif /* CONFIG_SWAP */ /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in * the low-order bits. * * We arrange the `type' and `offset' fields so that `type' is at the six * high-order bits of the swp_entry_t and `offset' is right-aligned in the * remaining bits. Although `type' itself needs only five bits, we allow for * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ #define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To * store PFN, we only need SWP_PFN_BITS bits. Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ #ifdef MAX_PHYSMEM_BITS #define SWP_PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) #else /* MAX_PHYSMEM_BITS */ #define SWP_PFN_BITS min_t(int, \ sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \ SWP_TYPE_SHIFT) #endif /* MAX_PHYSMEM_BITS */ #define SWP_PFN_MASK (BIT(SWP_PFN_BITS) - 1) /** * Migration swap entry specific bitfield definitions. Layout: * * |----------+--------------------| * | swp_type | swp_offset | * |----------+--------+-+-+-------| * | | resv |D|A| PFN | * |----------+--------+-+-+-------| * * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A) * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D) * * Note: A/D bits will be stored in migration entries iff there're enough * free bits in arch specific swp offset. By default we'll ignore A/D bits * when migrating a page. Please refer to migration_entry_supports_ad() * for more information. If there're more bits besides PFN and A/D bits, * they should be reserved and always be zeros. */ #define SWP_MIG_YOUNG_BIT (SWP_PFN_BITS) #define SWP_MIG_DIRTY_BIT (SWP_PFN_BITS + 1) #define SWP_MIG_TOTAL_BITS (SWP_PFN_BITS + 2) #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) static inline bool is_pfn_swap_entry(swp_entry_t entry); /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { if (pte_swp_exclusive(pte)) pte = pte_swp_clear_exclusive(pte); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); if (pte_swp_uffd_wp(pte)) pte = pte_swp_clear_uffd_wp(pte); return pte; } /* * Store a type+offset into a swp_entry_t in an arch-independent format */ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } /* * Extract the `type' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline unsigned swp_type(swp_entry_t entry) { return (entry.val >> SWP_TYPE_SHIFT); } /* * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in * arch-independent format */ static inline pgoff_t swp_offset(swp_entry_t entry) { return entry.val & SWP_OFFSET_MASK; } /* * This should only be called upon a pfn swap entry to get the PFN stored * in the swap entry. Please refers to is_pfn_swap_entry() for definition * of pfn swap entry. */ static inline unsigned long swp_offset_pfn(swp_entry_t entry) { VM_BUG_ON(!is_pfn_swap_entry(entry)); return swp_offset(entry) & SWP_PFN_MASK; } /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. */ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; pte = pte_swp_clear_flags(pte); arch_entry = __pte_to_swp_entry(pte); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. */ static inline pte_t swp_entry_to_pte(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pte(arch_entry); } static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_READ, offset); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_WRITE, offset); } static inline bool is_device_private_entry(swp_entry_t entry) { int type = swp_type(entry); return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; } #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_private_entry(swp_entry_t entry) { return false; } static inline bool is_writable_device_private_entry(swp_entry_t entry) { return false; } static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline bool is_device_exclusive_entry(swp_entry_t entry) { return false; } #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION static inline int is_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ || swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_writable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); } static inline int is_readable_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ); } static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) { return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); } static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ, offset); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(SWP_MIGRATION_WRITE, offset); } /* * Returns whether the host has large enough swap offset field to support * carrying over pgtable A/D bits for page migrations. The result is * pretty much arch specific. */ static inline bool migration_entry_supports_ad(void) { #ifdef CONFIG_SWAP return swap_migration_ad_supported; #else /* CONFIG_SWAP */ return false; #endif /* CONFIG_SWAP */ } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG); return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_YOUNG; /* Keep the old behavior of aging page after migration */ return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY); return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) return swp_offset(entry) & SWP_MIG_DIRTY; /* Keep the old behavior of clean page after migration */ return false; } extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); #else /* CONFIG_MIGRATION */ static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) { return swp_entry(0, 0); } static inline int is_migration_entry(swp_entry_t swp) { return 0; } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; } static inline int is_readable_migration_entry(swp_entry_t entry) { return 0; } static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_young(swp_entry_t entry) { return false; } static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } static inline bool is_migration_entry_dirty(swp_entry_t entry) { return false; } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE /* * Support for hardware poisoned pages */ static inline swp_entry_t make_hwpoison_entry(struct page *page) { BUG_ON(!PageLocked(page)); return swp_entry(SWP_HWPOISON, page_to_pfn(page)); } static inline int is_hwpoison_entry(swp_entry_t entry) { return swp_type(entry) == SWP_HWPOISON; } #else static inline swp_entry_t make_hwpoison_entry(struct page *page) { return swp_entry(0, 0); } static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } #endif typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) /* * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. * * Note that, when encountered by the faulting logic, PTEs with this marker will * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error * logic. */ #define PTE_MARKER_POISONED BIT(1) /* * Indicates that, on fault, this PTE will case a SIGSEGV signal to be * sent. This means guard markers behave in effect as if the region were mapped * PROT_NONE, rather than if they were a memory hole or equivalent. */ #define PTE_MARKER_GUARD BIT(2) #define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); } static inline bool is_pte_marker_entry(swp_entry_t entry) { return swp_type(entry) == SWP_PTE_MARKER; } static inline pte_marker pte_marker_get(swp_entry_t entry) { return swp_offset(entry) & PTE_MARKER_MASK; } static inline bool is_pte_marker(pte_t pte) { return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); } static inline swp_entry_t make_poisoned_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_POISONED); } static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); } static inline swp_entry_t make_guard_swp_entry(void) { return make_pte_marker_entry(PTE_MARKER_GUARD); } static inline int is_guard_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker * should be seen as a none pte; it's just that we have stored some information * onto the none pte so it becomes not-none any more. * * It should be used when the pte is file-backed, ram-based and backing * userspace pages, like shmem. It is not needed upon pgtables that do not * support pte markers at all. For example, it's not needed on anonymous * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. */ static inline int pte_none_mostly(pte_t pte) { return pte_none(pte) || is_pte_marker(pte); } static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding page is locked */ BUG_ON(is_migration_entry(entry) && !PageLocked(p)); return p; } static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) { struct folio *folio = pfn_folio(swp_offset_pfn(entry)); /* * Any use of migration entries may only occur while the * corresponding folio is locked */ BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); return folio; } /* * A pfn swap entry is a special type of swap entry that always has a pfn stored * in the swap offset. They can either be used to represent unaddressable device * memory, to restrict access to a page undergoing migration or to represent a * pfn which has been hwpoisoned and unmapped. */ static inline bool is_pfn_swap_entry(swp_entry_t entry) { /* Make sure the swp offset can always store the needed fields */ BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); return is_migration_entry(entry) || is_device_private_entry(entry) || is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); } struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page); extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new); extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { swp_entry_t arch_entry; if (pmd_swp_soft_dirty(pmd)) pmd = pmd_swp_clear_soft_dirty(pmd); if (pmd_swp_uffd_wp(pmd)) pmd = pmd_swp_clear_uffd_wp(pmd); arch_entry = __pmd_to_swp_entry(pmd); return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); return __swp_entry_to_pmd(arch_entry); } static inline int is_pmd_migration_entry(pmd_t pmd) { return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, struct page *page) { BUILD_BUG(); } static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { BUILD_BUG(); } static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) { return swp_entry(0, 0); } static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } static inline int is_pmd_migration_entry(pmd_t pmd) { return 0; } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */
222 3 227 3 231 230 1 229 230 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <asm/word-at-a-time.h> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 #else #define IS_UNALIGNED(src, dst) \ (((long) dst | (long) src) & (sizeof(long) - 1)) #endif /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). */ static __always_inline long do_strncpy_from_user(char *dst, const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; while (max >= sizeof(unsigned long)) { unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); /* * Note that we mask out the bytes following the NUL. This is * important to do because string oblivious code may read past * the NUL. For those routines, we don't want to give them * potentially random bytes after the NUL in `src`. * * One example of such code is BPF map keys. BPF treats map keys * as an opaque set of bytes. Without the post-NUL mask, any BPF * maps keyed by strings returned from strncpy_from_user() may * have multiple entries for semantically identical strings. */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); mask = zero_bytemask(data); *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } *(unsigned long *)(dst+res) = c; res += sizeof(unsigned long); max -= sizeof(unsigned long); } byte_at_a_time: while (max) { char c; unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; res++; max--; } /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, that's ok - we got as much as the user asked for. */ if (res >= count) return res; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ efault: return -EFAULT; } /** * strncpy_from_user: - Copy a NUL terminated string from userspace. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @src: Source address, in user space. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from userspace to kernel space. * * On success, returns the length of the string (not including the trailing * NUL). * * If access to userspace fails, returns -EFAULT (some data may have been * copied). * * If @count is smaller than the length of the string, copies @count bytes * and returns @count. */ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; might_fault(); if (should_fail_usercopy()) return -EFAULT; if (unlikely(count <= 0)) return 0; kasan_check_write(dst, count); check_object_size(dst, count, false); if (can_do_masked_user_access()) { long retval; src = masked_user_access_begin(src); retval = do_strncpy_from_user(dst, src, count, count); user_read_access_end(); return retval; } max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); return retval; } } return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user);
184 54 296 24 363 1 39 218 218 218 217 761 194 421 248 15 118 15 6 10 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* internal.h: mm/ internal definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H #include <linux/fs.h> #include <linux/khugepaged.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/pagemap.h> #include <linux/pagewalk.h> #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/swap_cgroup.h> #include <linux/tracepoint-defs.h> /* Internal core VMA manipulation functions. */ #include "vma.h" struct folio_batch; /* * Maintains state across a page table move. The operation assumes both source * and destination VMAs already exist and are specified by the user. * * Partial moves are permitted, but the old and new ranges must both reside * within a VMA. * * mmap lock must be held in write and VMA write locks must be held on any VMA * that is visible. * * Use the PAGETABLE_MOVE() macro to initialise this struct. * * The old_addr and new_addr fields are updated as the page table move is * executed. * * NOTE: The page table move is affected by reading from [old_addr, old_end), * and old_addr may be updated for better page table alignment, so len_in * represents the length of the range being copied as specified by the user. */ struct pagetable_move_control { struct vm_area_struct *old; /* Source VMA. */ struct vm_area_struct *new; /* Destination VMA. */ unsigned long old_addr; /* Address from which the move begins. */ unsigned long old_end; /* Exclusive address at which old range ends. */ unsigned long new_addr; /* Address to move page tables to. */ unsigned long len_in; /* Bytes to remap specified by user. */ bool need_rmap_locks; /* Do rmap locks need to be taken? */ bool for_stack; /* Is this an early temp stack being moved? */ }; #define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \ struct pagetable_move_control name = { \ .old = old_, \ .new = new_, \ .old_addr = old_addr_, \ .old_end = (old_addr_) + (len_), \ .new_addr = new_addr_, \ .len_in = len_, \ } /* * The set of flags that only affect watermark checking and reclaim * behaviour. This is used by the MM to obey the caller constraints * about IO, FS and watermark checking while ignoring placement * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) /* Control allocation cpuset and node placement constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. */ #define WARN_ON_ONCE_GFP(cond, gfp) ({ \ static bool __section(".data..once") __warned; \ int __ret_warn_once = !!(cond); \ \ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ __warned = true; \ WARN_ON(1); \ } \ unlikely(__ret_warn_once); \ }) void page_writeback_init(void); /* * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages, * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently * leaves nr_pages_mapped at 0, but avoid surprise if it participates later. */ #define ENTIRELY_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (ENTIRELY_MAPPED - 1) /* * Flags passed to __show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. * * Don't use this function outside of debugging code. */ static inline int folio_nr_pages_mapped(const struct folio *folio) { if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) return -1; return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED; } /* * Retrieve the first entry of a folio based on a provided entry within the * folio. We cannot rely on folio->swap as there is no guarantee that it has * been initialized. Used for calling arch_swap_restore() */ static inline swp_entry_t folio_swap(swp_entry_t entry, const struct folio *folio) { swp_entry_t swap = { .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)), }; return swap; } static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } /* * This is a file-backed mapping, and is about to be memory mapped - invoke its * mmap hook and safely handle error conditions. On error, VMA hooks will be * mutated. * * @file: File which backs the mapping. * @vma: VMA which we are mapping. * * Returns: 0 if success, error otherwise. */ static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { int err = call_mmap(file, vma); if (likely(!err)) return 0; /* * OK, we tried to call the file hook for mmap(), but an error * arose. The mapping is in an inconsistent state and we most not invoke * any further hooks on it. */ vma->vm_ops = &vma_dummy_vm_ops; return err; } /* * If the VMA has a close hook then close it, and since closing it might leave * it in an inconsistent state which makes the use of any hooks suspect, clear * them down by installing dummy empty hooks. */ static inline void vma_close(struct vm_area_struct *vma) { if (vma->vm_ops && vma->vm_ops->close) { vma->vm_ops->close(vma); /* * The mapping is in an inconsistent state, and no further hooks * may be invoked upon it. */ vma->vm_ops = &vma_dummy_vm_ops; } } #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; /* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ #define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) /* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ #define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (flags & FPB_IGNORE_DIRTY) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } /** * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @addr: The user virtual address the first page is mapped at. * @start_ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. * @any_writable: Optional pointer to indicate whether any entry except the * first one is writable. * @any_young: Optional pointer to indicate whether any entry except the * first one is young. * @any_dirty: Optional pointer to indicate whether any entry except the * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { pte_t expected_pte, *ptep; bool writable, young, dirty; int nr, cur_nr; if (any_writable) *any_writable = false; if (any_young) *any_young = false; if (any_dirty) *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ max_nr = min_t(unsigned long, max_nr, folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); nr = pte_batch_hint(start_ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); ptep = start_ptep + nr; while (nr < max_nr) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); if (any_young) young = !!pte_young(pte); if (any_dirty) dirty = !!pte_dirty(pte); pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; if (any_writable) *any_writable |= writable; if (any_young) *any_young |= young; if (any_dirty) *any_dirty |= dirty; cur_nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, cur_nr); ptep += cur_nr; nr += cur_nr; } return min(nr, max_nr); } /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * @delta: The direction and the offset we are moving; forward if delta * is positive; backward if delta is negative * * Moves the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_move_swp_offset(pte_t pte, long delta) { swp_entry_t entry = pte_to_swp_entry(pte); pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry), (swp_offset(entry) + delta))); if (pte_swp_soft_dirty(pte)) new = pte_swp_mksoft_dirty(new); if (pte_swp_exclusive(pte)) new = pte_swp_mkexclusive(new); if (pte_swp_uffd_wp(pte)) new = pte_swp_mkuffd_wp(new); return new; } /** * pte_next_swp_offset - Increment the swap entry offset field of a swap pte. * @pte: The initial pte state; is_swap_pte(pte) must be true and * non_swap_entry() must be false. * * Increments the swap offset, while maintaining all other fields, including * swap type, and any swp pte bits. The resulting pte is returned. */ static inline pte_t pte_next_swp_offset(pte_t pte) { return pte_move_swp_offset(pte, 1); } /** * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries * @start_ptep: Page table pointer for the first entry. * @max_nr: The maximum number of table entries to consider. * @pte: Page table entry for the first entry. * * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs * containing swap entries all with consecutive offsets and targeting the same * swap type, all with matching swp pte bits. * * max_nr must be at least one and must be limited by the caller so scanning * cannot exceed a single page table. * * Return: the number of table entries in the batch. */ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte) { pte_t expected_pte = pte_next_swp_offset(pte); const pte_t *end_ptep = start_ptep + max_nr; swp_entry_t entry = pte_to_swp_entry(pte); pte_t *ptep = start_ptep + 1; unsigned short cgroup_id; VM_WARN_ON(max_nr < 1); VM_WARN_ON(!is_swap_pte(pte)); VM_WARN_ON(non_swap_entry(entry)); cgroup_id = lookup_swap_cgroup_id(entry); while (ptep < end_ptep) { pte = ptep_get(ptep); if (!pte_same(pte, expected_pte)) break; if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id) break; expected_pte = pte_next_swp_offset(expected_pte); ptep++; } return ptep - start_ptep; } #endif /* CONFIG_MMU */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, int nr_throttled); static inline void acct_reclaim_writeback(struct folio *folio) { pg_data_t *pgdat = folio_pgdat(folio); int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); if (nr_throttled) __acct_reclaim_writeback(pgdat, folio, nr_throttled); } static inline void wake_throttle_isolated(pg_data_t *pgdat) { wait_queue_head_t *wqh; wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; if (waitqueue_active(wqh)) wake_up(wqh); } vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf); static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { vm_fault_t ret = __vmf_anon_prepare(vmf); if (unlikely(ret & VM_FAULT_RETRY)) vma_end_read(vmf->vma); return ret; } vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details); void zap_page_range_single_batched(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long size, struct zap_details *details); int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, unsigned int order); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) { DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); force_page_cache_ra(&ractl, nr_to_read); } unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end); long mapping_evict_folio(struct address_space *mapping, struct folio *folio); unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed); /** * folio_evictable - Test whether a folio is evictable. * @folio: The folio to test. * * Test whether @folio is evictable -- i.e., should be placed on * active/inactive lists vs unevictable list. * * Reasons folio might not be evictable: * 1. folio's mapping marked unevictable * 2. One of the pages in the folio is part of an mlocked VMA */ static inline bool folio_evictable(struct folio *folio) { bool ret; /* Prevent address_space of inode and swap cache from being freed */ rcu_read_lock(); ret = !mapping_unevictable(folio_mapping(folio)) && !folio_test_mlocked(folio); rcu_read_unlock(); return ret; } /* * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } /* * Return true if a folio needs ->release_folio() calling upon it. */ static inline bool folio_needs_release(struct folio *folio) { struct address_space *mapping = folio_mapping(folio); return folio_has_private(folio) || (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; /* * Maximum number of reclaim retries without progress before the OOM * killer is consider the only way forward. */ #define MAX_RECLAIM_RETRIES 16 /* * in mm/vmscan.c: */ bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); /* * in mm/rmap.c: */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c */ #define K(x) ((x) << (PAGE_SHIFT-10)) extern char * const zone_names[MAX_NR_ZONES]; /* perform sanity checks on struct pages being allocated or freed */ DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); extern int min_free_kbytes; extern int defrag_mode; void setup_per_zone_wmarks(void); void calculate_min_free_kbytes(void); int __meminit init_per_zone_wmark_min(void); void page_alloc_sysctl_init(void); /* * Structure for holding the mostly immutable allocation parameters passed * between functions involved in allocations, including the alloc_pages* * family of functions. * * nodemask, migratetype and highest_zoneidx are initialized only once in * __alloc_pages() and then never change. * * zonelist, preferred_zone and highest_zoneidx are set first in * __alloc_pages() for the fast path, and might be later changed * in __alloc_pages_slowpath(). All other functions pass the whole structure * by a const pointer. */ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; int migratetype; /* * highest_zoneidx represents highest usable zone index of * the allocation request. Due to the nature of the zone, * memory on lower zone than the highest_zoneidx will be * protected by lowmem_reserve[highest_zoneidx]. * * highest_zoneidx is also used by reclaim/compaction to limit * the target zone since higher zone than this index cannot be * usable for this allocation request. */ enum zone_type highest_zoneidx; bool spread_dirty_pages; }; /* * This function returns the order of a free page in the buddy system. In * general, page_zone(page)->lock must be held by the caller to prevent the * page from being allocated in parallel and returning garbage as the order. * If a caller does not hold page_zone(page)->lock, it must guarantee that the * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use buddy_order_unsafe() below. */ static inline unsigned int buddy_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); } /* * Like buddy_order(), but for callers who cannot afford to hold the zone lock. * PageBuddy() should be checked first by the caller to minimize race window, * and invalid values must be handled gracefully. * * READ_ONCE is used so that if the caller assigns the result into a local * variable and e.g. tests it for valid range before using, the compiler cannot * decide to remove the variable and inline the page_private(page) multiple * times, potentially observing different values in the tests and the actual * use of the result. */ #define buddy_order_unsafe(page) READ_ONCE(page_private(page)) /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * * For recording whether a page is in the buddy system, we set PageBuddy. * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ static inline bool page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { if (!page_is_guard(buddy) && !PageBuddy(buddy)) return false; if (buddy_order(buddy) != order) return false; /* * zone check is done late to avoid uselessly calculating * zone/node ids for pages that could never merge. */ if (page_zone_id(page) != page_zone_id(buddy)) return false; VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); return true; } /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). * * 1) Any buddy B1 will have an order O twin B2 which satisfies * the following equation: * B2 = B1 ^ (1 << O) * For example, if the starting buddy (buddy2) is #8 its order * 1 buddy is #10: * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 * * 2) Any buddy B will have an order O+1 parent P which * satisfies the following equation: * P = B & ~(1 << O) * * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER */ static inline unsigned long __find_buddy_pfn(unsigned long page_pfn, unsigned int order) { return page_pfn ^ (1 << order); } /* * Find the buddy of @page and validate it. * @page: The input page * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the * function is used in the performance-critical __free_one_page(). * @order: The order of the page * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to * page_to_pfn(). * * The found buddy can be a non PageBuddy, out of @page's zone, or its order is * not the same as @page. The validation is necessary before use it. * * Return: the found buddy page or NULL if not found. */ static inline struct page *find_buddy_page_pfn(struct page *page, unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) { unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); struct page *buddy; buddy = page + (__buddy_pfn - pfn); if (buddy_pfn) *buddy_pfn = __buddy_pfn; if (page_is_buddy(page, buddy, order)) return buddy; return NULL; } extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone); static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, unsigned long end_pfn, struct zone *zone) { if (zone->contiguous) return pfn_to_page(start_pfn); return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); } void set_zone_contiguous(struct zone *zone); bool pfn_range_intersects_zones(int nid, unsigned long start_pfn, unsigned long nr_pages); static inline void clear_zone_contiguous(struct zone *zone) { zone->contiguous = false; } extern int __isolate_free_page(struct page *page, unsigned int order); extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order, enum meminit_context context); /* * This will have no effect, other than possibly generating a warning, if the * caller passes in a non-large folio. */ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 1U << order; #endif } bool __folio_unqueue_deferred_split(struct folio *folio); static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) return false; /* * At this point, there is no one trying to add the folio to * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) return false; return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) { struct folio *folio = (struct folio *)page; if (folio && folio_test_large(folio)) folio_set_large_rmappable(folio); return folio; } static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; folio_set_order(folio, order); atomic_set(&folio->_large_mapcount, -1); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, 0); if (IS_ENABLED(CONFIG_MM_ID)) { folio->_mm_ids = 0; folio->_mm_id_mapcount[0] = -1; folio->_mm_id_mapcount[1] = -1; } if (IS_ENABLED(CONFIG_64BIT) || order > 1) { atomic_set(&folio->_pincount, 0); atomic_set(&folio->_entire_mapcount, -1); } if (order > 1) INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) { struct page *p = head + tail_idx; p->mapping = TAIL_MAPPING; set_compound_head(p, head); set_page_private(p, 0); } void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid, nodemask_t *); #define __alloc_frozen_pages(...) \ alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__)) void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); #ifdef CONFIG_NUMA struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order); #else static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order) { return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL); } #endif #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); extern void zone_pcp_init(struct zone *zone); extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int); #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* * in mm/compaction.c */ /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. The free_pfn starts * at the end of a zone and migrate_pfn begins at the start. Movable pages * are moved to the end of a zone during a compaction run and the run * completes when free_pfn <= migrate_pfn */ struct compact_control { struct list_head freepages[NR_PAGE_ORDERS]; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ /* * Acts as an in/out parameter to page isolation for migration. * isolate_migratepages uses it as a search base. * isolate_migratepages_block will update the value to the next pfn * after the last isolated one. */ unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; unsigned long total_free_scanned; unsigned short fast_search_fail;/* failures to use free list searches */ short search_order; /* order to start a fast search at */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ int order; /* order a direct compactor needs */ int migratetype; /* migratetype of direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int highest_zoneidx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock contention */ bool finish_pageblock; /* Scan the remainder of a pageblock. Used * when there are potentially transient * isolation or migration failures to * ensure forward progress. */ bool alloc_contig; /* alloc_contig_range allocation */ }; /* * Used in direct compaction when a page should be taken from the freelists * immediately when one is created during the free path. */ struct capture_control { struct compact_control *cc; struct page *page; }; unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void init_cma_reserved_pageblock(struct page *page); #endif /* CONFIG_COMPACTION || CONFIG_CMA */ struct cma; #ifdef CONFIG_CMA void *cma_reserve_early(struct cma *cma, unsigned long size); void init_cma_pageblock(struct page *page); #else static inline void *cma_reserve_early(struct cma *cma, unsigned long size) { return NULL; } static inline void init_cma_pageblock(struct page *page) { } #endif int find_suitable_fallback(struct free_area *area, unsigned int order, int migratetype, bool claimable); static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } /* mm/util.c */ struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: * Caller knows at least 1 page of folio is associated with page table of VMA * and the range [start, end) is intersect with the VMA range. Caller wants * to know whether the folio is fully associated with the range. It calls * this function to check whether the folio is in the range first. Then checks * the page table to know whether the folio is fully mapped to the range. */ static inline bool folio_within_range(struct folio *folio, struct vm_area_struct *vma, unsigned long start, unsigned long end) { pgoff_t pgoff, addr; unsigned long vma_pglen = vma_pages(vma); VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); if (start > end) return false; if (start < vma->vm_start) start = vma->vm_start; if (end > vma->vm_end) end = vma->vm_end; pgoff = folio_pgoff(folio); /* if folio start address is not in vma range */ if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) return false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); return !(addr < start || end - addr < folio_size(folio)); } static inline bool folio_within_vma(struct folio *folio, struct vm_area_struct *vma) { return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); } /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, * under page table lock for the pte/pmd being added or removed. * * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at * the end of folio_remove_rmap_*(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. * 1) VM_IO check prevents migration from double-counting during mlock. * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED * is never left set on a VM_SPECIAL vma, there is an interval while * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, struct vm_area_struct *vma) { /* * munlock if the function is called. Ideally, we should only * do munlock if any page of folio is unmapped from VMA and * cause folio not fully mapped to VMA. * * But it's not easy to confirm that's the situation. So we * always munlock the folio and page reclaim will correct it * if it's wrong. */ if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } void mlock_new_folio(struct folio *folio); bool need_mlock_drain(int cpu); void mlock_drain_local(void); void mlock_drain_remote(int cpu); extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /** * vma_address - Find the virtual address a page range is mapped at * @vma: The vma which maps this object. * @pgoff: The page offset within its object. * @nr_pages: The number of pages to consider. * * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; if (pgoff >= vma->vm_pgoff) { address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address >= vma->vm_end) address = -EFAULT; } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { /* Test above avoids possibility of wrap to 0 on 32-bit */ address = vma->vm_start; } else { address = -EFAULT; } return address; } /* * Then at what user virtual address will none of the range be found in vma? * Assumes that vma_address() already returned a good starting address. */ static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) { struct vm_area_struct *vma = pvmw->vma; pgoff_t pgoff; unsigned long address; /* Common case, plus ->pgoff is invalid for KSM */ if (pvmw->nr_pages == 1) return pvmw->address + PAGE_SIZE; pgoff = pvmw->pgoff + pvmw->nr_pages; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); /* Check for address beyond vma (or wrapped through 0?) */ if (address < vma->vm_start || address > vma->vm_end) address = vma->vm_end; return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, struct file *fpin) { int flags = vmf->flags; if (fpin) return fpin; /* * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or * anything, so we only pin the file and drop the mmap_lock if only * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. */ if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); release_fault_lock(vmf); } return fpin; } #else /* !CONFIG_MMU */ static inline void unmap_mapping_folio(struct folio *folio) { } static inline void mlock_new_folio(struct folio *folio) { } static inline bool need_mlock_drain(int cpu) { return false; } static inline void mlock_drain_local(void) { } static inline void mlock_drain_remote(int cpu) { } static inline void vunmap_range_noflush(unsigned long start, unsigned long end) { } #endif /* !CONFIG_MMU */ /* Memory initialisation debug and verification */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT DECLARE_STATIC_KEY_TRUE(deferred_pages); bool __init deferred_grow_zone(struct zone *zone, unsigned int order); #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void init_deferred_page(unsigned long pfn, int nid); enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE }; #ifdef CONFIG_DEBUG_MEMORY_INIT extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) extern void mminit_verify_pageflags_layout(void); extern void mminit_verify_zonelist(void); #else static inline void mminit_dprintk(enum mminit_level level, const char *prefix, const char *fmt, ...) { } static inline void mminit_verify_pageflags_layout(void) { } static inline void mminit_verify_zonelist(void) { } #endif /* CONFIG_DEBUG_MEMORY_INIT */ #define NODE_RECLAIM_NOSCAN -2 #define NODE_RECLAIM_FULL -1 #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 #ifdef CONFIG_NUMA extern int node_reclaim_mode; extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); extern int find_next_best_node(int node, nodemask_t *used_node_mask); #else #define node_reclaim_mode 0 static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, unsigned int order) { return NODE_RECLAIM_NOSCAN; } static inline int find_next_best_node(int node, nodemask_t *used_node_mask) { return NUMA_NO_NODE; } #endif static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); } /* * mm/memory-failure.c */ #ifdef CONFIG_MEMORY_FAILURE int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; extern u32 hwpoison_filter_dev_minor; extern u64 hwpoison_filter_flags_mask; extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); unsigned long page_mapped_in_vma(const struct page *page, struct vm_area_struct *vma); #else static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill) { return -EBUSY; } #endif extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); /* The ALLOC_WMARK bits are used as an index to zone->watermark */ #define ALLOC_WMARK_MIN WMARK_MIN #define ALLOC_WMARK_LOW WMARK_LOW #define ALLOC_WMARK_HIGH WMARK_HIGH #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) /* * Only MMU archs have async oom victim reclaim - aka oom_reaper so we * cannot assume a reduced access to memory reserves is sufficient for * !MMU */ #ifdef CONFIG_MMU #define ALLOC_OOM 0x08 #else #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif #define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access * to 25% of the min watermark or * 62.5% if __GFP_HIGH is set. */ #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% * of the min watermark. */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #ifdef CONFIG_ZONE_DMA32 #define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ #else #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ #define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ #define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) enum ttu_flags; struct tlbflush_unmap_batch; /* * only for MM internal work items which do not depend on * any allocations or locks which might depend on allocations */ extern struct workqueue_struct *mm_percpu_wq; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { } static inline void try_to_unmap_flush_dirty(void) { } static inline void flush_tlb_batched_pending(struct mm_struct *mm) { } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; static inline bool is_migrate_highatomic(enum migratetype migratetype) { return migratetype == MIGRATE_HIGHATOMIC; } void setup_zone_pageset(struct zone *zone); struct migration_target_control { int nid; /* preferred node id */ nodemask_t *nmask; gfp_t gfp_mask; enum migrate_reason reason; }; /* * mm/filemap.c */ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, struct folio *folio, loff_t fpos, size_t size); /* * mm/vmalloc.c */ #ifdef CONFIG_MMU void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { } static inline int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { return -EINVAL; } #endif int __must_check __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, unsigned long addr, int *flags, bool writable, int *last_cpupid); void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); /* * mm/gup.c */ int __must_check try_grab_folio(struct folio *folio, int refs, unsigned int flags); /* * mm/huge_memory.c */ void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, bool write); void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. */ static inline int get_order_from_str(const char *size_str, unsigned long valid_orders) { unsigned long size; char *endptr; int order; size = memparse(size_str, &endptr); if (!is_power_of_2(size)) return -EINVAL; order = get_order(size); if (BIT(order) & ~valid_orders) return -EINVAL; return order; } enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, /* a retry, previous pass started an IO */ FOLL_TRIED = 1 << 17, /* we are working on non-current tsk/mm */ FOLL_REMOTE = 1 << 18, /* pages must be released via unpin_user_page */ FOLL_PIN = 1 << 19, /* gup_fast: prevent fall-back to slow gup */ FOLL_FAST_ONLY = 1 << 20, /* allow unlocking the mmap lock */ FOLL_UNLOCKABLE = 1 << 21, /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */ FOLL_MADV_POPULATE = 1 << 22, }; #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \ FOLL_MADV_POPULATE) /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the * GUP pin will remain consistent with the pages mapped into the page tables * of the MM. * * Temporary unmapping of PageAnonExclusive() pages or clearing of * PageAnonExclusive() has to protect against concurrent GUP: * * Ordinary GUP: Using the PT lock * * GUP-fast and fork(): mm->write_protect_seq * * GUP-fast and KSM or temporary unmapping (swap, migration): see * folio_try_share_anon_rmap_*() * * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. * * If the vma is NULL, we're coming from the GUP-fast path and might have * to fallback to the slow path just to lookup the vma. */ static inline bool gup_must_unshare(struct vm_area_struct *vma, unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry * has to be writable -- and if it references (part of) an anonymous * folio, that part is required to be marked exclusive. */ if ((flags & (FOLL_WRITE | FOLL_PIN)) != FOLL_PIN) return false; /* * Note: PageAnon(page) is stable until the page is actually getting * freed. */ if (!PageAnon(page)) { /* * We only care about R/O long-term pining: R/O short-term * pinning does not have the semantics to observe successive * changes through the process page tables. */ if (!(flags & FOLL_LONGTERM)) return false; /* We really need the vma ... */ if (!vma) return true; /* * ... because we only care about writable private ("COW") * mappings where we have to break COW early. */ return is_cow_mapping(vma->vm_flags); } /* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */ if (IS_ENABLED(CONFIG_HAVE_GUP_FAST)) smp_rmb(); /* * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. */ return !PageAnonExclusive(page); } extern bool mirrored_kernelcore; bool memblock_has_mirror(void); void memblock_free_all(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; } static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { /* * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty * enablements, because when without soft-dirty being compiled in, * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) * will be constantly true. */ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) return false; /* * Soft-dirty is kind of special: its tracking is enabled when the * vma flags not set. */ return !(vma->vm_flags & VM_SOFTDIRTY); } static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd) { return vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd); } static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte) { return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte); } void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid); void __meminit __init_page_from_nid(unsigned long pfn, int nid); /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); return shrinker->name ? 0 : -ENOMEM; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { kfree_const(shrinker->name); shrinker->name = NULL; } extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, const char *fmt, va_list ap) { return 0; } static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) { } static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { *debugfs_id = -1; return NULL; } static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) { } #endif /* CONFIG_SHRINKER_DEBUG */ /* Only track the nodes of mappings with shadow entries */ void workingset_update_node(struct xa_node *node); extern struct list_lru shadow_nodes; #define mapping_set_update(xas, mapping) do { \ if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ xas_set_update(xas, workingset_update_node); \ xas_set_lru(xas, &shadow_nodes); \ } \ } while (0) /* mremap.c */ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { } #endif /* CONFIG_UNACCEPTED_MEMORY */ /* pagewalk.c */ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, pmd_t pmdval); void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, struct mmu_gather *tlb); #ifdef CONFIG_PT_RECLAIM bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details); #else static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, struct zap_details *details) { return false; } #endif /* CONFIG_PT_RECLAIM */ void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm); int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm); #endif /* __MM_INTERNAL_H */
416 417 1139 1136 418 418 419 419 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 // SPDX-License-Identifier: GPL-2.0 /* * Common Block IO controller cgroup interface * * Based on ideas and code from CFQ, CFS and BFQ: * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> * * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> * Paolo Valente <paolo.valente@unimore.it> * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> * * For policy-specific per-blkcg data: * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/sched/signal.h> #include <linux/err.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-throttle.h" static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_register_mutex nests outside of it and synchronizes entire * policy [un]register operations including cgroup file additions / * removals. Putting cgroup file registration outside blkcg_pol_mutex * allows grabbing it from cgroup callbacks. */ static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; static DEFINE_RAW_SPINLOCK(blkg_stat_lock); #define BLKG_DESTROY_BATCH_SIZE 64 /* * Lockless lists for tracking IO stats update * * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). * There are multiple blkg's (one for each block device) attached to each * blkcg. The rstat code keeps track of which cpu has IO stats updated, * but it doesn't know which blkg has the updated stats. If there are many * block devices in a system, the cost of iterating all the blkg's to flush * out the IO stats can be high. To reduce such overhead, a set of percpu * lockless lists (lhead) per blkcg are used to track the set of recently * updated iostat_cpu's since the last flush. An iostat_cpu will be put * onto the lockless list on the update side [blk_cgroup_bio_start()] if * not there yet and then removed when being flushed [blkcg_rstat_flush()]. * References to blkg are gotten and then put back in the process to * protect against blkg removal. * * Return: 0 if successful or -ENOMEM if allocation fails. */ static int init_blkcg_llists(struct blkcg *blkcg) { int cpu; blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); if (!blkcg->lhead) return -ENOMEM; for_each_possible_cpu(cpu) init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); return 0; } /** * blkcg_css - find the current css * * Find the css associated with either the kthread or the current task. * This may return a dying css, so it is up to the caller to use tryget logic * to confirm it is alive and well. */ static struct cgroup_subsys_state *blkcg_css(void) { struct cgroup_subsys_state *css; css = kthread_blkcg(); if (css) return css; return task_css(current, io_cgrp_id); } static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); struct request_queue *q = blkg->q; int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), * in order to make sure pd_free_fn() is called in order, the deletion * of the list blkg->q_node is delayed to here from blkg_destroy(), and * blkcg_mutex is used to synchronize blkg_free_workfn() and * blkcg_deactivate_policy(). */ mutex_lock(&q->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { if (!blkg) return; /* * Both ->pd_free_fn() and request queue's release handler may * sleep, so free us by scheduling one work func */ INIT_WORK(&blkg->free_work, blkg_free_workfn); schedule_work(&blkg->free_work); } static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); struct blkcg *blkcg = blkg->blkcg; int cpu; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); #endif /* * Flush all the non-empty percpu lockless lists before releasing * us, given these stat belongs to us. * * blkg_stat_lock is for serializing blkg stat update */ for_each_possible_cpu(cpu) __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); call_rcu(&blkg->rcu_head, __blkg_release); } #ifdef CONFIG_BLK_CGROUP_PUNT_BIO static struct workqueue_struct *blkcg_punt_bio_wq; static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; struct blk_plug plug; bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { need_plug = true; blk_start_plug(&plug); } while ((bio = bio_list_pop(&bios))) submit_bio(bio); if (need_plug) blk_finish_plug(&plug); } /* * When a shared kthread issues a bio for a cgroup, doing so synchronously can * lead to priority inversions as the kthread can be trapped waiting for that * cgroup. Use this helper instead of submit_bio to punt the actual issuing to * a dedicated per-blkcg work item to avoid such priority inversions. */ void blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; if (blkg->parent) { spin_lock(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); } else { /* never bounce for the root cgroup */ submit_bio(bio); } } EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); static int __init blkcg_punt_bio_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_punt_bio_init); #endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ /** * bio_blkcg_css - return the blkcg CSS associated with a bio * @bio: target bio * * This returns the CSS for the blkcg associated with a bio, or %NULL if not * associated. Callers are expected to either handle %NULL or know association * has been done prior to calling this. */ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { if (!bio || !bio->bi_blkg) return NULL; return &bio->bi_blkg->blkcg->css; } EXPORT_SYMBOL_GPL(bio_blkcg_css); /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest * * Return the parent blkcg of @blkcg. Can be called anytime. */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto out_exit_refcnt; if (!blk_get_queue(disk->queue)) goto out_free_iostat; blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; blkg->iostat.blkg = blkg; #ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); #endif u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; pd->online = false; } return blkg; out_free_pds: while (--i >= 0) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); blk_put_queue(disk->queue); out_free_iostat: free_percpu(blkg->iostat_cpu); out_exit_refcnt: percpu_ref_exit(&blkg->refcnt); out_free_blkg: kfree(blkg); return NULL; } /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i]) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); blkg->pd[i]->online = true; } } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: if (new_blkg) blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @disk: gendisk of interest * * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = blkg_lookup(blkcg, q); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) rcu_assign_pointer(blkcg->blkg_hint, blkg); goto found; } /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, disk, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* * blkg stays on the queue list until blkg_free_workfn(), see details in * blkg_free_workfn(), hence this function can be called from * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before * blkg_free_workfn(). */ if (hlist_unhashed(&blkg->blkcg_node)) return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && blkg->pd[i]->online) { blkg->pd[i]->online = false; if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) continue; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. */ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void __blkg_clear_stat(struct blkg_iostat_set *bis) { struct blkg_iostat cur = {0}; unsigned long flags; flags = u64_stats_update_begin_irqsave(&bis->sync); blkg_iostat_set(&bis->cur, &cur); blkg_iostat_set(&bis->last, &cur); u64_stats_update_end_irqrestore(&bis->sync, flags); } static void blkg_clear_stat(struct blkcg_gq *blkg) { int cpu; for_each_possible_cpu(cpu) { struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu); __blkg_clear_stat(s); } __blkg_clear_stat(&blkg->iostat); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { blkg_clear_stat(blkg); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /** * blkg_conf_init - initialize a blkg_conf_ctx * @ctx: blkg_conf_ctx to initialize * @input: input string * * Initialize @ctx which can be used to parse blkg config input string @input. * Once initialized, @ctx can be used with blkg_conf_open_bdev() and * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). */ void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { *ctx = (struct blkg_conf_ctx){ .input = input }; } EXPORT_SYMBOL_GPL(blkg_conf_init); /** * blkg_conf_open_bdev - parse and open bdev for per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is * set to point past the device node prefix. * * This function may be called multiple times on @ctx and the extra calls become * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function * explicitly if bdev access is needed without resolving the blkcg / policy part * of @ctx->input. Returns -errno on error. */ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { char *input = ctx->input; unsigned int major, minor; struct block_device *bdev; int key_len; if (ctx->bdev) return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; input += key_len; if (!isspace(*input)) return -EINVAL; input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return -ENODEV; } mutex_lock(&bdev->bd_queue->rq_qos_mutex); if (!disk_live(bdev->bd_disk)) { blkdev_put_no_open(bdev); mutex_unlock(&bdev->bd_queue->rq_qos_mutex); return -ENODEV; } ctx->body = input; ctx->bdev = bdev; return 0; } /* * Similar to blkg_conf_open_bdev, but additionally freezes the queue, * acquires q->elevator_lock, and ensures the correct locking order * between q->elevator_lock and q->rq_qos_mutex. * * This function returns negative error on failure. On success it returns * memflags which must be saved and later passed to blkg_conf_exit_frozen * for restoring the memalloc scope. */ unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx) { int ret; unsigned long memflags; if (ctx->bdev) return -EINVAL; ret = blkg_conf_open_bdev(ctx); if (ret < 0) return ret; /* * At this point, we haven’t started protecting anything related to QoS, * so we release q->rq_qos_mutex here, which was first acquired in blkg_ * conf_open_bdev. Later, we re-acquire q->rq_qos_mutex after freezing * the queue and acquiring q->elevator_lock to maintain the correct * locking order. */ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); memflags = blk_mq_freeze_queue(ctx->bdev->bd_queue); mutex_lock(&ctx->bdev->bd_queue->elevator_lock); mutex_lock(&ctx->bdev->bd_queue->rq_qos_mutex); return memflags; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Parse per-blkg config update from @ctx->input and initialize @ctx * accordingly. On success, @ctx->body points to the part of @ctx->input * following MAJ:MIN, @ctx->bdev points to the target block device and * @ctx->blkg to the blkg being configured. * * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this * function returns with queue lock held and must be followed by * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkg_conf_ctx *ctx) __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; ret = blkg_conf_open_bdev(ctx); if (ret) return ret; disk = ctx->bdev->bd_disk; q = disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). */ ret = blk_queue_enter(q, 0); if (ret) goto fail; spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } blkg = blkg_lookup(blkcg, q); if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit_queue; } spin_lock_irq(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: blk_queue_exit(q); ctx->blkg = blkg; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); fail_exit_queue: blk_queue_exit(q); fail: /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_exit - clean up per-blkg config update * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * * Clean up after per-blkg config update. This function must be called on all * blkg_conf_ctx's initialized with blkg_conf_init(). */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); ctx->blkg = NULL; } if (ctx->bdev) { mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; } } EXPORT_SYMBOL_GPL(blkg_conf_exit); /* * Similar to blkg_conf_exit, but also unfreezes the queue and releases * q->elevator_lock. Should be used when blkg_conf_open_bdev_frozen * is used to open the bdev. */ void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags) { if (ctx->bdev) { struct request_queue *q = ctx->bdev->bd_queue; blkg_conf_exit(ctx); mutex_unlock(&q->elevator_lock); blk_mq_unfreeze_queue(q, memflags); } } static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, struct blkg_iostat *last) { struct blkg_iostat delta; unsigned long flags; /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, cur); blkg_iostat_sub(&delta, last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) { struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); struct llist_node *lnode; struct blkg_iostat_set *bisc, *next_bisc; unsigned long flags; rcu_read_lock(); lnode = llist_del_all(lhead); if (!lnode) goto out; /* * For covering concurrent parent blkg update from blkg_release(). * * When flushing from cgroup, the subsystem rstat lock is always held, * so this lock won't cause contention most of time. */ raw_spin_lock_irqsave(&blkg_stat_lock, flags); /* * Iterate only the iostat_cpu's queued in the lockless list. */ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; struct blkg_iostat cur; unsigned int seq; /* * Order assignment of `next_bisc` from `bisc->lnode.next` in * llist_for_each_entry_safe and clearing `bisc->lqueued` for * avoiding to assign `next_bisc` with new next pointer added * in blk_cgroup_bio_start() in case of re-ordering. * * The pair barrier is implied in llist_add() in blk_cgroup_bio_start(). */ smp_mb(); WRITE_ONCE(bisc->lqueued, false); if (bisc == &blkg->iostat) goto propagate_up; /* propagate up to parent only */ /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); blkcg_iostat_update(blkg, &cur, &bisc->last); propagate_up: /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); /* * Queue parent->iostat to its blkcg's lockless * list to propagate up to the grandparent if the * iostat hasn't been queued yet. */ if (!parent->iostat.lqueued) { struct llist_head *plhead; plhead = per_cpu_ptr(parent->blkcg->lhead, cpu); llist_add(&parent->iostat.lnode, plhead); parent->iostat.lqueued = true; } } } raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); out: rcu_read_unlock(); } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { /* Root-level stats are sourced from system-wide IO stats */ if (cgroup_parent(css->cgroup)) __blkcg_rstat_flush(css_to_blkcg(css), cpu); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. */ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; pol->pd_stat_fn(blkg->pd[i], s); } seq_puts(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) { return &css_to_blkcg(css)->cgwb_list; } #endif /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } /** * blkcg_pin_online - pin online state * @blkcg_css: blkcg of interest * * While pinned, a blkcg is kept online. This is primarily used to * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline * while an associated cgwb is still active. */ void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) { refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); } /** * blkcg_unpin_online - unpin online state * @blkcg_css: blkcg of interest * * This is primarily used to impedance-match blkg and cgwb lifetimes so * that blkg doesn't go offline while an associated cgwb is still active. * When this count goes to zero, all active cgwbs have finished so the * blkcg can continue destruction by calling blkcg_destroy_blkgs(). */ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) { struct blkcg *blkcg = css_to_blkcg(blkcg_css); do { struct blkcg *parent; if (!refcount_dec_and_test(&blkcg->online_pin)) break; parent = blkcg_parent(blkcg); blkcg_destroy_blkgs(blkcg); blkcg = parent; } while (blkcg); } /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(css); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(css); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); free_percpu(blkcg->lhead); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) goto unlock; } if (init_blkcg_llists(blkcg)) goto free_blkcg; for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto free_pd_blkcg; blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_percpu(blkcg->lhead); free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(&parent->css); return 0; } void blkg_init_queue(struct request_queue *q) { INIT_LIST_HEAD(&q->blkg_list); mutex_init(&q->blkcg_mutex); } int blkcg_init_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ /* spin_lock_irq can serve as RCU read-side critical section. */ spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return 0; err_unlock: spin_unlock_irq(&q->queue_lock); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } void blkcg_exit_disk(struct gendisk *disk) { blkg_destroy_all(disk); blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_disk) put_disk(tsk->throttle_disk); tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to activate * * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; unsigned int memflags; int ret; if (blkcg_policy_enabled(q, pol)) return 0; /* * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, * for example, ioprio. Such policy will work on blkcg level, not disk * level, and don't need to be activated. */ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) return -EINVAL; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. */ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } spin_lock(&blkg->blkcg->lock); pd->blkg = blkg; pd->plid = pol->plid; blkg->pd[pol->plid] = pd; if (pol->pd_init_fn) pol->pd_init_fn(pd); if (pol->pd_online_fn) pol->pd_online_fn(pd); pd->online = true; spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; struct blkg_policy_data *pd; spin_lock(&blkcg->lock); pd = blkg->pd[pol->plid]; if (pd) { if (pd->online && pol->pd_offline_fn) pol->pd_offline_fn(pd); pd->online = false; pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk * @disk: gendisk of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { struct request_queue *q = disk->queue; struct blkcg_gq *blkg; unsigned int memflags; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) memflags = blk_mq_freeze_queue(q); mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q, memflags); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); static void blkcg_free_all_cpd(struct blkcg_policy *pol) { struct blkcg *blkcg; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. */ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; /* * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy * without pd_alloc_fn/pd_free_fn can't be activated. */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) return -EINVAL; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); ret = -ENOSPC; goto err_unlock; } /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = -ENOMEM; goto err_free_cpds; } blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes == pol->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); } mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) blkcg_free_all_cpd(pol); blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!disk) return; current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); put_disk(disk); return; out: rcu_read_unlock(); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_disk != disk) { if (test_bit(GD_DEAD, &disk->state)) return; get_device(disk_to_dev(disk)); if (current->throttle_disk) put_disk(current->throttle_disk); current->throttle_disk = disk; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; if (blk_op_is_passthrough(bio->bi_opf)) return; rcu_read_lock(); if (bio->bi_blkg) css = bio_blkcg_css(bio); else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) return; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; /* * If the iostat_cpu isn't in a lockless list, put it into the * list to indicate that a stat update is pending. */ if (!READ_ONCE(bis->lqueued)) { struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); llist_add(&bis->lnode, lhead); WRITE_ONCE(bis->lqueued, true); } u64_stats_update_end_irqrestore(&bis->sync, flags); css_rstat_updated(&blkcg->css, cpu); put_cpu(); } bool blk_cgroup_congested(void) { struct blkcg *blkcg; bool ret = false; rcu_read_lock(); for (blkcg = css_to_blkcg(blkcg_css()); blkcg; blkcg = blkcg_parent(blkcg)) { if (atomic_read(&blkcg->congestion_count)) { ret = true; break; } } rcu_read_unlock(); return ret; } module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
269 271 270 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0 #include <linux/slab.h> #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/cpumask.h> #include <linux/export.h> #include <linux/memblock.h> #include <linux/numa.h> /* These are not inline because of header tangles. */ #ifdef CONFIG_CPUMASK_OFFSTACK /** * alloc_cpumask_var_node - allocate a struct cpumask on a given node * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * @node: memory node from which to allocate or %NUMA_NO_NODE * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * Return: TRUE if memory allocation succeeded, FALSE otherwise. * * In addition, mask will be NULL if this fails. Note that gcc is * usually smart enough to know that mask can never be NULL if * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case * too. */ bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { *mask = kmalloc_node(cpumask_size(), flags, node); #ifdef CONFIG_DEBUG_PER_CPU_MAPS if (!*mask) { printk(KERN_ERR "=> alloc_cpumask_var: failed!\n"); dump_stack(); } #endif return *mask != NULL; } EXPORT_SYMBOL(alloc_cpumask_var_node); /** * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena. * @mask: pointer to cpumask_var_t where the cpumask is returned * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop (in <linux/cpumask.h>). * Either returns an allocated (zero-filled) cpumask, or causes the * system to panic. */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** * free_cpumask_var - frees memory allocated for a struct cpumask. * @mask: cpumask to free * * This is safe on a NULL mask. */ void free_cpumask_var(cpumask_var_t mask) { kfree(mask); } EXPORT_SYMBOL(free_cpumask_var); /** * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var * @mask: cpumask to free */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { memblock_free(mask, cpumask_size()); } #endif /** * cpumask_local_spread - select the i'th cpu based on NUMA distances * @i: index number * @node: local numa_node * * Return: online CPU according to a numa aware policy; local cpus are returned * first, followed by non-local ones, then it wraps around. * * For those who wants to enumerate all CPUs based on their NUMA distances, * i.e. call this function in a loop, like: * * for (i = 0; i < num_online_cpus(); i++) { * cpu = cpumask_local_spread(i, node); * do_something(cpu); * } * * There's a better alternative based on for_each()-like iterators: * * for_each_numa_hop_mask(mask, node) { * for_each_cpu_andnot(cpu, mask, prev) * do_something(cpu); * prev = mask; * } * * It's simpler and more verbose than above. Complexity of iterator-based * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while * cpumask_local_spread() when called for each cpu is * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)). */ unsigned int cpumask_local_spread(unsigned int i, int node) { unsigned int cpu; /* Wrap: we always want a cpu. */ i %= num_online_cpus(); cpu = sched_numa_find_nth_cpu(cpu_online_mask, i, node); WARN_ON(cpu >= nr_cpu_ids); return cpu; } EXPORT_SYMBOL(cpumask_local_spread); static DEFINE_PER_CPU(int, distribute_cpu_mask_prev); /** * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p. * @src1p: first &cpumask for intersection * @src2p: second &cpumask for intersection * * Iterated calls using the same srcp1 and srcp2 will be distributed within * their intersection. * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_and_wrap(prev, src1p, src2p); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); /** * cpumask_any_distribute - Return an arbitrary cpu from srcp * @srcp: &cpumask for selection * * Return: >= nr_cpu_ids if the intersection is empty. */ unsigned int cpumask_any_distribute(const struct cpumask *srcp) { unsigned int next, prev; /* NOTE: our first selection will skip 0. */ prev = __this_cpu_read(distribute_cpu_mask_prev); next = cpumask_next_wrap(prev, srcp); if (next < nr_cpu_ids) __this_cpu_write(distribute_cpu_mask_prev, next); return next; } EXPORT_SYMBOL(cpumask_any_distribute);
1787 1786 273 273 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // SPDX-License-Identifier: GPL-2.0 #include <linux/bug.h> #include <linux/export.h> #include <linux/types.h> #include <linux/mmdebug.h> #include <linux/mm.h> #include <asm/memory.h> phys_addr_t __virt_to_phys(unsigned long x) { WARN(!__is_lm_address(__tag_reset(x)), "virt_to_phys used for non-linear address: %p (%pS)\n", (void *)x, (void *)x); return __virt_to_phys_nodebug(x); } EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { /* * This is bounds checking against the kernel image only. * __pa_symbol should only be used on kernel symbol addresses. */ VIRTUAL_BUG_ON(x < (unsigned long) KERNEL_START || x > (unsigned long) KERNEL_END); return __pa_symbol_nodebug(x); } EXPORT_SYMBOL(__phys_addr_symbol);
29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : 0; __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name); __assign_str(driver); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/fs.h> #include <linux/path.h> #include <linux/slab.h> #include <linux/fs_struct.h> #include "internal.h" /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_root(struct fs_struct *fs, const struct path *path) { struct path old_root; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_root = fs->root; fs->root = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_root.dentry) path_put(&old_root); } /* * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values. * It can block. */ void set_fs_pwd(struct fs_struct *fs, const struct path *path) { struct path old_pwd; path_get(path); spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); old_pwd = fs->pwd; fs->pwd = *path; write_seqcount_end(&fs->seq); spin_unlock(&fs->lock); if (old_pwd.dentry) path_put(&old_pwd); } static inline int replace_path(struct path *p, const struct path *old, const struct path *new) { if (likely(p->dentry != old->dentry || p->mnt != old->mnt)) return 0; *p = *new; return 1; } void chroot_fs_refs(const struct path *old_root, const struct path *new_root) { struct task_struct *g, *p; struct fs_struct *fs; int count = 0; read_lock(&tasklist_lock); for_each_process_thread(g, p) { task_lock(p); fs = p->fs; if (fs) { int hits = 0; spin_lock(&fs->lock); write_seqcount_begin(&fs->seq); hits += replace_path(&fs->root, old_root, new_root); hits += replace_path(&fs->pwd, old_root, new_root); write_seqcount_end(&fs->seq); while (hits--) { count++; path_get(new_root); } spin_unlock(&fs->lock); } task_unlock(p); } read_unlock(&tasklist_lock); while (count--) path_put(old_root); } void free_fs_struct(struct fs_struct *fs) { path_put(&fs->root); path_put(&fs->pwd); kmem_cache_free(fs_cachep, fs); } void exit_fs(struct task_struct *tsk) { struct fs_struct *fs = tsk->fs; if (fs) { int kill; task_lock(tsk); spin_lock(&fs->lock); tsk->fs = NULL; kill = !--fs->users; spin_unlock(&fs->lock); task_unlock(tsk); if (kill) free_fs_struct(fs); } } struct fs_struct *copy_fs_struct(struct fs_struct *old) { struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); /* We don't need to lock fs - think why ;-) */ if (fs) { fs->users = 1; fs->in_exec = 0; spin_lock_init(&fs->lock); seqcount_spinlock_init(&fs->seq, &fs->lock); fs->umask = old->umask; spin_lock(&old->lock); fs->root = old->root; path_get(&fs->root); fs->pwd = old->pwd; path_get(&fs->pwd); spin_unlock(&old->lock); } return fs; } int unshare_fs_struct(void) { struct fs_struct *fs = current->fs; struct fs_struct *new_fs = copy_fs_struct(fs); int kill; if (!new_fs) return -ENOMEM; task_lock(current); spin_lock(&fs->lock); kill = !--fs->users; current->fs = new_fs; spin_unlock(&fs->lock); task_unlock(current); if (kill) free_fs_struct(fs); return 0; } EXPORT_SYMBOL_GPL(unshare_fs_struct); int current_umask(void) { return current->fs->umask; } EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock), .umask = 0022, };
6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: * Fix for packet capture - Nick Eggleston <nick@dccinc.com>; * Add HW acceleration hooks - David S. Miller <davem@redhat.com>; * Correct all the locking - David S. Miller <davem@redhat.com>; * Use hash table for VLAN groups - David S. Miller <davem@redhat.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/uaccess.h> #include <linux/if_vlan.h> #include "vlan.h" #include "vlanproc.h" #define DRV_VERSION "1.8" /* Global VLAN variables */ unsigned int vlan_net_id __read_mostly; const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; /* End of global variables definitions. */ static int vlan_group_prealloc_vid(struct vlan_group *vg, __be16 vlan_proto, u16 vlan_id) { struct net_device **array; unsigned int vidx; unsigned int size; int pidx; ASSERT_RTNL(); pidx = vlan_proto_idx(vlan_proto); if (pidx < 0) return -EINVAL; vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN; array = vg->vlan_devices_arrays[pidx][vidx]; if (array != NULL) return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; /* paired with smp_rmb() in __vlan_group_get_device() */ smp_wmb(); vg->vlan_devices_arrays[pidx][vidx] = array; return 0; } static void vlan_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev, struct vlan_dev_priv *vlan) { if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_stacked_transfer_operstate(rootdev, dev); } void unregister_vlan_dev(struct net_device *dev, struct list_head *head) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct vlan_info *vlan_info; struct vlan_group *grp; u16 vlan_id = vlan->vlan_id; ASSERT_RTNL(); vlan_info = rtnl_dereference(real_dev->vlan_info); BUG_ON(!vlan_info); grp = &vlan_info->grp; grp->nr_vlan_devs--; if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_leave(dev); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_leave(dev); vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL); netdev_upper_dev_unlink(real_dev, dev); /* Because unregister_netdevice_queue() makes sure at least one rcu * grace period is respected before device freeing, * we dont need to call synchronize_net() here. */ unregister_netdevice_queue(dev, head); if (grp->nr_vlan_devs == 0) { vlan_mvrp_uninit_applicant(real_dev); vlan_gvrp_uninit_applicant(real_dev); } vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); } int vlan_check_real_dev(struct net_device *real_dev, __be16 protocol, u16 vlan_id, struct netlink_ext_ack *extack) { const char *name = real_dev->name; if (real_dev->features & NETIF_F_VLAN_CHALLENGED || real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; } if (vlan_find_dev(real_dev, protocol, vlan_id) != NULL) { NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists"); return -EEXIST; } return 0; } int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; u16 vlan_id = vlan->vlan_id; struct vlan_info *vlan_info; struct vlan_group *grp; int err; err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id); if (err) return err; vlan_info = rtnl_dereference(real_dev->vlan_info); /* vlan_info should be there now. vlan_vid_add took care of it */ BUG_ON(!vlan_info); grp = &vlan_info->grp; if (grp->nr_vlan_devs == 0) { err = vlan_gvrp_init_applicant(real_dev); if (err < 0) goto out_vid_del; err = vlan_mvrp_init_applicant(real_dev); if (err < 0) goto out_uninit_gvrp; } err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id); if (err < 0) goto out_uninit_mvrp; err = register_netdevice(dev); if (err < 0) goto out_uninit_mvrp; err = netdev_upper_dev_link(real_dev, dev, extack); if (err) goto out_unregister_netdev; vlan_stacked_transfer_operstate(real_dev, dev, vlan); linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */ /* So, got the sucker initialized, now lets place * it into our local structure. */ vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; return 0; out_unregister_netdev: unregister_netdevice(dev); out_uninit_mvrp: if (grp->nr_vlan_devs == 0) vlan_mvrp_uninit_applicant(real_dev); out_uninit_gvrp: if (grp->nr_vlan_devs == 0) vlan_gvrp_uninit_applicant(real_dev); out_vid_del: vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); return err; } /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns 0 if the device was created or a negative error code otherwise. */ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) { struct net_device *new_dev; struct vlan_dev_priv *vlan; struct net *net = dev_net(real_dev); struct vlan_net *vn = net_generic(net, vlan_net_id); char name[IFNAMSIZ]; int err; if (vlan_id >= VLAN_VID_MASK) return -ERANGE; err = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id, NULL); if (err < 0) return err; /* Gotta set up the fields for the device. */ switch (vn->name_type) { case VLAN_NAME_TYPE_RAW_PLUS_VID: /* name will look like: eth1.0005 */ snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: vlan5 */ snprintf(name, IFNAMSIZ, "vlan%i", vlan_id); break; case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: /* Put our vlan.VID in the name. * Name will look like: eth0.5 */ snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id); break; case VLAN_NAME_TYPE_PLUS_VID: /* Put our vlan.VID in the name. * Name will look like: vlan0005 */ default: snprintf(name, IFNAMSIZ, "vlan%.4i", vlan_id); } new_dev = alloc_netdev(sizeof(struct vlan_dev_priv), name, NET_NAME_UNKNOWN, vlan_setup); if (new_dev == NULL) return -ENOBUFS; dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. */ new_dev->mtu = real_dev->mtu; vlan = vlan_dev_priv(new_dev); vlan->vlan_proto = htons(ETH_P_8021Q); vlan->vlan_id = vlan_id; vlan->real_dev = real_dev; vlan->dent = NULL; vlan->flags = VLAN_FLAG_REORDER_HDR; new_dev->rtnl_link_ops = &vlan_link_ops; err = register_vlan_dev(new_dev, NULL); if (err < 0) goto out_free_newdev; return 0; out_free_newdev: free_netdev(new_dev); return err; } static void vlan_sync_address(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); /* May be called without an actual change */ if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; /* vlan continues to inherit address of lower device */ if (vlan_dev_inherit_address(vlandev, dev)) goto out; /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_del(dev, vlandev->dev_addr); /* vlan address was equal to the old address and is different from * the new address */ if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } static void vlan_transfer_features(struct net_device *dev, struct net_device *vlandev) { struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev); netif_inherit_tso_max(vlandev, dev); if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto)) vlandev->hard_header_len = dev->hard_header_len; else vlandev->hard_header_len = dev->hard_header_len + VLAN_HLEN; #if IS_ENABLED(CONFIG_FCOE) vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid; #endif vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE; vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev); netdev_update_features(vlandev); } static int __vlan_device_event(struct net_device *dev, unsigned long event) { int err = 0; switch (event) { case NETDEV_CHANGENAME: vlan_proc_rem_dev(dev); err = vlan_proc_add_dev(dev); break; case NETDEV_REGISTER: err = vlan_proc_add_dev(dev); break; case NETDEV_UNREGISTER: vlan_proc_rem_dev(dev); break; } return err; } static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vlan_group *grp; struct vlan_info *vlan_info; int i, flgs; struct net_device *vlandev; struct vlan_dev_priv *vlan; bool last = false; LIST_HEAD(list); int err; if (is_vlan_dev(dev)) { int err = __vlan_device_event(dev, event); if (err) return notifier_from_errno(err); } if ((event == NETDEV_UP) && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) { pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name); vlan_vid_add(dev, htons(ETH_P_8021Q), 0); } if (event == NETDEV_DOWN && (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) vlan_vid_del(dev, htons(ETH_P_8021Q), 0); vlan_info = rtnl_dereference(dev->vlan_info); if (!vlan_info) goto out; grp = &vlan_info->grp; /* It is OK that we do not hold the group lock right now, * as we run under the RTNL lock. */ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); break; case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan_sync_address(dev, vlandev); } break; case NETDEV_CHANGEMTU: vlan_group_for_each_dev(grp, i, vlandev) { if (vlandev->mtu <= dev->mtu) continue; dev_set_mtu(vlandev, dev->mtu); } break; case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ vlan_group_for_each_dev(grp, i, vlandev) vlan_transfer_features(dev, vlandev); break; case NETDEV_DOWN: { struct net_device *tmp; LIST_HEAD(close_list); /* Put all VLANs for this dev in the down state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = vlandev->flags; if (!(flgs & IFF_UP)) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) list_add(&vlandev->close_list, &close_list); } dev_close_many(&close_list, false); list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) { vlan_stacked_transfer_operstate(dev, vlandev, vlan_dev_priv(vlandev)); list_del_init(&vlandev->close_list); } list_del(&close_list); break; } case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ vlan_group_for_each_dev(grp, i, vlandev) { flgs = dev_get_flags(vlandev); if (flgs & IFF_UP) continue; vlan = vlan_dev_priv(vlandev); if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) dev_change_flags(vlandev, flgs | IFF_UP, extack); vlan_stacked_transfer_operstate(dev, vlandev, vlan); } break; case NETDEV_UNREGISTER: /* twiddle thumbs on netns device moves */ if (dev->reg_state != NETREG_UNREGISTERING) break; vlan_group_for_each_dev(grp, i, vlandev) { /* removal of last vid destroys vlan_info, abort * afterwards */ if (vlan_info->nr_vids == 1) last = true; unregister_vlan_dev(vlandev, &list); if (last) break; } unregister_netdevice_many(&list); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlaying device to change its type. */ if (vlan_uses_dev(dev)) return NOTIFY_BAD; break; case NETDEV_NOTIFY_PEERS: case NETDEV_BONDING_FAILOVER: case NETDEV_RESEND_IGMP: /* Propagate to vlan devices */ vlan_group_for_each_dev(grp, i, vlandev) call_netdevice_notifiers(event, vlandev); break; case NETDEV_CVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q)); if (err) return notifier_from_errno(err); break; case NETDEV_CVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q)); break; case NETDEV_SVLAN_FILTER_PUSH_INFO: err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD)); if (err) return notifier_from_errno(err); break; case NETDEV_SVLAN_FILTER_DROP_INFO: vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD)); break; } out: return NOTIFY_DONE; } static struct notifier_block vlan_notifier_block __read_mostly = { .notifier_call = vlan_device_event, }; /* * VLAN IOCTL handler. * o execute requested action or pass command to the device driver * arg is really a struct vlan_ioctl_args __user *. */ static int vlan_ioctl_handler(struct net *net, void __user *arg) { int err; struct vlan_ioctl_args args; struct net_device *dev = NULL; if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) return -EFAULT; /* Null terminate this sucker, just in case. */ args.device1[sizeof(args.device1) - 1] = 0; args.u.device2[sizeof(args.u.device2) - 1] = 0; rtnl_lock(); switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: case SET_VLAN_EGRESS_PRIORITY_CMD: case SET_VLAN_FLAG_CMD: case ADD_VLAN_CMD: case DEL_VLAN_CMD: case GET_VLAN_REALDEV_NAME_CMD: case GET_VLAN_VID_CMD: err = -ENODEV; dev = __dev_get_by_name(net, args.device1); if (!dev) goto out; err = -EINVAL; if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev)) goto out; } switch (args.cmd) { case SET_VLAN_INGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; vlan_dev_set_ingress_priority(dev, args.u.skb_priority, args.vlan_qos); err = 0; break; case SET_VLAN_EGRESS_PRIORITY_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_set_egress_priority(dev, args.u.skb_priority, args.vlan_qos); break; case SET_VLAN_FLAG_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = vlan_dev_change_flags(dev, args.vlan_qos ? args.u.flag : 0, args.u.flag); break; case SET_VLAN_NAME_TYPE_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) { struct vlan_net *vn; vn = net_generic(net, vlan_net_id); vn->name_type = args.u.name_type; err = 0; } else { err = -EINVAL; } break; case ADD_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = register_vlan_device(dev, args.u.VID); break; case DEL_VLAN_CMD: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; unregister_vlan_dev(dev, NULL); err = 0; break; case GET_VLAN_REALDEV_NAME_CMD: err = 0; vlan_dev_get_realdev_name(dev, args.u.device2, sizeof(args.u.device2)); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; case GET_VLAN_VID_CMD: err = 0; args.u.VID = vlan_dev_vlan_id(dev); if (copy_to_user(arg, &args, sizeof(struct vlan_ioctl_args))) err = -EFAULT; break; default: err = -EOPNOTSUPP; break; } out: rtnl_unlock(); return err; } static int __net_init vlan_init_net(struct net *net) { struct vlan_net *vn = net_generic(net, vlan_net_id); int err; vn->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; err = vlan_proc_init(net); return err; } static void __net_exit vlan_exit_net(struct net *net) { vlan_proc_cleanup(net); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit = vlan_exit_net, .id = &vlan_net_id, .size = sizeof(struct vlan_net), }; static int __init vlan_proto_init(void) { int err; pr_info("%s v%s\n", vlan_fullname, vlan_version); err = register_pernet_subsys(&vlan_net_ops); if (err < 0) goto err0; err = register_netdevice_notifier(&vlan_notifier_block); if (err < 0) goto err2; err = vlan_gvrp_init(); if (err < 0) goto err3; err = vlan_mvrp_init(); if (err < 0) goto err4; err = vlan_netlink_init(); if (err < 0) goto err5; vlan_ioctl_set(vlan_ioctl_handler); return 0; err5: vlan_mvrp_uninit(); err4: vlan_gvrp_uninit(); err3: unregister_netdevice_notifier(&vlan_notifier_block); err2: unregister_pernet_subsys(&vlan_net_ops); err0: return err; } static void __exit vlan_cleanup_module(void) { vlan_ioctl_set(NULL); vlan_netlink_fini(); unregister_netdevice_notifier(&vlan_notifier_block); unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ vlan_mvrp_uninit(); vlan_gvrp_uninit(); } module_init(vlan_proto_init); module_exit(vlan_cleanup_module); MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION);
304 60 158 311 2 310 7 546 60 60 60 57 546 306 335 4 1039 1036 60 340 105 57 1041 1036 297 1039 41 1039 337 337 570 1042 329 136 977 310 7 977 7 2 60 7 314 314 4 313 60 314 545 546 546 545 311 7 60 311 4 2 4 313 313 313 313 312 60 303 4 313 147 1040 336 1042 1039 304 297 34 977 657 315 314 315 314 307 307 41 307 306 306 307 277 307 306 307 82 82 81 329 74 80 306 257 82 60 60 4 60 60 60 60 303 303 303 303 60 60 4 60 60 60 14 14 1 47 2 32 47 47 19 1 60 60 60 15 59 60 60 59 60 60 60 54 310 60 60 60 60 60 60 60 60 60 74 60 58 7 45 37 4 31 40 29 4 31 1 25 8 5 56 4 45 18 4 51 51 51 337 336 97 143 337 4 4 4 4 34 17 28 34 34 60 34 27 20 20 19 1 33 4 60 34 60 60 2 60 60 60 60 60 60 4 60 60 60 59 7 60 60 60 60 60 60 60 60 60 60 60 20 27 33 1 27 20 30 7 34 996 998 996 973 762 998 998 994 995 7 34 34 4 34 34 34 34 34 34 34 34 34 7 29 34 33 1 27 20 34 30 13 57 57 57 57 4 57 53 10 57 57 4 56 57 57 57 57 57 57 52 57 42 14 25 35 25 14 43 43 57 57 14 25 43 57 13 57 136 136 136 8 128 135 136 136 297 270 28 303 20 24 336 337 337 28 337 337 54 54 28 28 28 28 18 2 17 2 19 3 1 8 11 227 295 295 295 295 295 27 28 28 9 19 28 28 261 222 74 303 74 310 311 247 244 2 311 10 51 307 311 303 74 10 10 2 8 10 46 66 7 4 108 5 190 4 42 41 1 42 58 333 58 164 53 42 10 311 28 58 304 304 20 334 73 241 28 57 2 13 313 136 297 337 304 337 190 334 113 336 331 76 57 42 311 320 320 320 303 147 8 29 20 20 20 20 20 20 315 314 36 36 36 35 9 36 36 314 324 323 51 316 316 325 36 325 325 325 41 56 545 314 545 4 545 545 546 546 545 546 546 314 640 639 637 548 546 64 639 545 640 545 638 640 637 546 544 20 20 20 20 20 20 20 246 227 678 225 225 20 20 20 20 20 20 20 20 4 4 4 4 4 4 4 4 129 128 4 4 129 7 128 297 297 296 289 289 9 288 95 289 312 312 312 303 304 304 336 337 335 99 99 312 296 153 295 296 127 40 40 325 325 10 10 325 325 132 132 544 546 255 546 545 1 226 546 546 319 294 295 295 294 295 9 9 8 8 5 20 20 321 322 251 128 26 696 695 695 700 37 151 695 37 696 314 314 314 314 314 314 314 314 314 314 313 314 314 314 314 314 314 314 314 313 314 314 314 314 314 314 314 314 314 314 314 313 314 313 314 314 314 314 313 314 314 314 311 314 314 314 313 313 312 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mt_set_height(struct maple_tree *mt, unsigned char height) { unsigned int new_flags = mt->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MT_BUG_ON(mt, height > MAPLE_HEIGHT_MAX); new_flags |= height << MT_FLAGS_HEIGHT_OFFSET; mt->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *node; node = mte_to_node(enode); return ma_dead_node(node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count) { if (head->node_count < MAPLE_ALLOC_SLOTS) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->slot[0] = head; reuse->node_count = 1; } reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; node->request_count = 0; requested--; } node = mas->alloc; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; /* find a non-full node*/ do { node = node->slot[0]; } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 0; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. * @new_height: if we are inserting a root node, update the height of the tree */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode, char new_height) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mt_set_height(mas->tree, new_height); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. * @new_height: The new height of the tree as a result of the operation */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode, new_height); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { *mid_split = 0; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * @new_height: The new height of the tree as a result of the operation * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode, new_height); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * @new_height: The new height of the tree as a result of the operation * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode, unsigned char new_height) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode, new_height); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; unsigned char new_height = 0; /* used if node is a new root */ struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); new_height++; /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) { if (mast_overflow(mast)) continue; if (mast->orig_l->node == mast->orig_r->node) { /* * The data in b_node should be stored in one * node and in the tree */ slot = mast->l->offset; break; } continue; } /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); new_height++; mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode, new_height); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); unsigned char new_height = mas_mt_height(mas); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent, new_height); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned int orig_height = mas_mt_height(mas); unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= orig_height) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, &mast, true)) { height++; break; } /* Try to push right. */ if (mas_push_data(mas, &mast, false)) { height++; break; } split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old, height); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mt_set_height(mas->tree, 1); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; if (mas->end < mt_slots[wr_mas->type] - 1) wr_mas->vacant_height = mas->depth + 1; if (ma_is_root(mas_mn(mas))) { /* root needs more than 2 entries to be sufficient + 1 */ if (mas->end > 2) wr_mas->sufficient_height = 1; } else if (mas->end > mt_min_slots[wr_mas->type] + 1) wr_mas->sufficient_height = mas->depth + 1; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mt_set_height(mas->tree, 0); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mt_set_height(mas->tree, 1); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); unsigned char height = mas_mt_height(mas); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode, height); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_invalid: MT_BUG_ON(mas->tree, 1); } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @wr_mas: The maple write state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); int ret = height * 3 + 1; unsigned char delta = height - wr_mas->vacant_height; switch (mas->store_type) { case wr_exact_fit: case wr_append: case wr_slot_store: ret = 0; break; case wr_spanning_store: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 3 + 1; else ret = delta * 3 + 1; break; case wr_split_store: ret = delta * 2 + 1; break; case wr_rebalance: if (wr_mas->sufficient_height < wr_mas->vacant_height) ret = (height - wr_mas->sufficient_height) * 2 + 1; else ret = delta * 2 + 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_invalid: WARN_ON_ONCE(1); } return ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { int request; mas_wr_prealloc_setup(wr_mas); wr_mas->mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(wr_mas, entry); if (!request) return; mas_node_count(wr_mas->mas, request); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } request = mas_prealloc_calc(&wr_mas, entry); if (!request) goto store; mas_node_count(mas, request); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); int ret = 0; int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(&wr_mas, entry); if (!request) return ret; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_destroy(mas); mas_reset(mas); return ret; } mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; if (mas_is_err(mas)) mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_INETDEVICE_H #define _LINUX_INETDEVICE_H #ifdef __KERNEL__ #include <linux/bitmap.h> #include <linux/if.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <linux/timer.h> #include <linux/sysctl.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> struct ipv4_devconf { void *sysctl; int data[IPV4_DEVCONF_MAX]; DECLARE_BITMAP(state, IPV4_DEVCONF_MAX); }; #define MC_HASH_SZ_LOG 9 struct in_device { struct net_device *dev; netdevice_tracker dev_tracker; refcount_t refcnt; int dead; struct in_ifaddr __rcu *ifa_list;/* IP ifaddr chain */ struct ip_mc_list __rcu *mc_list; /* IP multicast filter chain */ struct ip_mc_list __rcu * __rcu *mc_hash; int mc_count; /* Number of installed mcasts */ spinlock_t mc_tomb_lock; struct ip_mc_list *mc_tomb; unsigned long mr_v1_seen; unsigned long mr_v2_seen; unsigned long mr_maxdelay; unsigned long mr_qi; /* Query Interval */ unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ struct neigh_parms *arp_parms; struct ipv4_devconf cnf; struct rcu_head rcu_head; }; #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) #define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr)) #define IPV4_DEVCONF_ALL(net, attr) \ IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) #define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr)) static inline int ipv4_devconf_get(const struct in_device *in_dev, int index) { index--; return READ_ONCE(in_dev->cnf.data[index]); } static inline void ipv4_devconf_set(struct in_device *in_dev, int index, int val) { index--; set_bit(index, in_dev->cnf.state); WRITE_ONCE(in_dev->cnf.data[index], val); } static inline void ipv4_devconf_setall(struct in_device *in_dev) { bitmap_fill(in_dev->cnf.state, IPV4_DEVCONF_MAX); } #define IN_DEV_CONF_GET(in_dev, attr) \ ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr) #define IN_DEV_CONF_SET(in_dev, attr, val) \ ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) #define IN_DEV_ANDCONF(in_dev, attr) \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_NET_ORCONF(in_dev, net, attr) \ (IPV4_DEVCONF_ALL_RO(net, attr) || \ IN_DEV_CONF_GET((in_dev), attr)) #define IN_DEV_ORCONF(in_dev, attr) \ IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) #define IN_DEV_MAXCONF(in_dev, attr) \ (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \ IN_DEV_CONF_GET((in_dev), attr))) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) #define IN_DEV_BFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), BC_FORWARDING) #define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SRC_VMARK(in_dev) IN_DEV_ORCONF((in_dev), SRC_VMARK) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_ACCEPT_LOCAL(in_dev) IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) #define IN_DEV_LOG_MARTIANS(in_dev) IN_DEV_ORCONF((in_dev), LOG_MARTIANS) #define IN_DEV_PROXY_ARP(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP) #define IN_DEV_PROXY_ARP_PVLAN(in_dev) IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN) #define IN_DEV_SHARED_MEDIA(in_dev) IN_DEV_ORCONF((in_dev), SHARED_MEDIA) #define IN_DEV_TX_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), SEND_REDIRECTS) #define IN_DEV_SEC_REDIRECTS(in_dev) IN_DEV_ORCONF((in_dev), \ SECURE_REDIRECTS) #define IN_DEV_IDTAG(in_dev) IN_DEV_CONF_GET(in_dev, TAG) #define IN_DEV_MEDIUM_ID(in_dev) IN_DEV_CONF_GET(in_dev, MEDIUM_ID) #define IN_DEV_PROMOTE_SECONDARIES(in_dev) \ IN_DEV_ORCONF((in_dev), \ PROMOTE_SECONDARIES) #define IN_DEV_ROUTE_LOCALNET(in_dev) IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET) #define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net) \ IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET) #define IN_DEV_RX_REDIRECTS(in_dev) \ ((IN_DEV_FORWARD(in_dev) && \ IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \ || (!IN_DEV_FORWARD(in_dev) && \ IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS))) #define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \ IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN) #define IN_DEV_ARPFILTER(in_dev) IN_DEV_ORCONF((in_dev), ARPFILTER) #define IN_DEV_ARP_ACCEPT(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ACCEPT) #define IN_DEV_ARP_ANNOUNCE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE) #define IN_DEV_ARP_IGNORE(in_dev) IN_DEV_MAXCONF((in_dev), ARP_IGNORE) #define IN_DEV_ARP_NOTIFY(in_dev) IN_DEV_MAXCONF((in_dev), ARP_NOTIFY) #define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \ ARP_EVICT_NOCARRIER) struct in_ifaddr { struct hlist_node addr_lst; struct in_ifaddr __rcu *ifa_next; struct in_device *ifa_dev; struct rcu_head rcu_head; __be32 ifa_local; __be32 ifa_address; __be32 ifa_mask; __u32 ifa_rt_priority; __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 ifa_valid_lft; __u32 ifa_preferred_lft; unsigned long ifa_cstamp; /* created timestamp */ unsigned long ifa_tstamp; /* updated timestamp */ }; struct in_validator_info { __be32 ivi_addr; struct in_device *ivi_dev; struct netlink_ext_ack *extack; }; int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); int register_inetaddr_validator_notifier(struct notifier_block *nb); int unregister_inetaddr_validator_notifier(struct notifier_block *nb); void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) { return __ip_dev_find(net, addr, true); } int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b); int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *); #ifdef CONFIG_INET int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size); #else static inline int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { return 0; } #endif void devinet_init(void); struct in_device *inetdev_by_index(struct net *, int); __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope); __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope); struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask); struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr); static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa) { return !((addr^ifa->ifa_address)&ifa->ifa_mask); } /* * Check if a mask is acceptable. */ static __inline__ bool bad_mask(__be32 mask, __be32 addr) { __u32 hmask; if (addr & (mask = ~mask)) return true; hmask = ntohl(mask); if (hmask & (hmask+1)) return true; return false; } #define in_dev_for_each_ifa_rtnl(ifa, in_dev) \ for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) #define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ ifa = rtnl_net_dereference(net, ifa->ifa_next)) #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->ip_ptr); } static inline struct in_device *in_dev_get(const struct net_device *dev) { struct in_device *in_dev; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) refcount_inc(&in_dev->refcnt); rcu_read_unlock(); return in_dev; } static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->ip_ptr); } static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); } /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { struct in_device *in_dev; bool rc = false; in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) rc = true; return rc; } static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); return in_dev ? in_dev->arp_parms : NULL; } void in_dev_finish_destroy(struct in_device *idev); static inline void in_dev_put(struct in_device *idev) { if (refcount_dec_and_test(&idev->refcnt)) in_dev_finish_destroy(idev); } #define __in_dev_put(idev) refcount_dec(&(idev)->refcnt) #define in_dev_hold(idev) refcount_inc(&(idev)->refcnt) #endif /* __KERNEL__ */ static __inline__ __be32 inet_make_mask(int logmask) { if (logmask) return htonl(~((1U<<(32-logmask))-1)); return 0; } static __inline__ int inet_mask_len(__be32 mask) { __u32 hmask = ntohl(mask); if (!hmask) return 0; return 32 - ffz(~hmask); } #endif /* _LINUX_INETDEVICE_H */
25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * NOHZ implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/compiler.h> #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } /* * The time when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64-bit can do a quick check without holding the jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32-bit cannot do that because the store of 'tick_next_period' * consists of two 32-bit stores, and the first store could be * moved by the CPU to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on 'jiffies_lock' and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Re-evaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the 'jiffies_seq' protected job */ jiffies_64 += ticks; /* Keep the tick_next_period variable up to date */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above, and ensures that the update to 'jiffies_64' is * not reordered vs. the store to 'tick_next_period', neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32-bit, as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but 'jiffies_lock' needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Have we started the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } static inline int tick_sched_flag_test(struct tick_sched *ts, unsigned long flag) { return !!(ts->flags & flag); } static inline void tick_sched_flag_set(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags |= flag; } static inline void tick_sched_flag_clear(struct tick_sched *ts, unsigned long flag) { lockdep_assert_irqs_disabled(); ts->flags &= ~flag; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int tick_cpu, cpu = smp_processor_id(); /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * 'jiffies_lock'. * * If nohz_full is enabled, this should not happen because the * 'tick_do_timer_cpu' CPU never relinquishes. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif WRITE_ONCE(tick_do_timer_cpu, cpu); tick_cpu = cpu; } /* Check if jiffies need an update */ if (tick_cpu == cpu) tick_do_update_jiffies64(now); /* * If the jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on completely idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too many ticks. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call when we are not in IRQ context and have * no valid 'regs' pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* * In dynticks mode, tick reprogram is deferred: * - to the idle task if in dynticks-idle * - to IRQ exit if in full-dynticks. */ if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED))) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, and an IPI can then be optimized out. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask * * XXX given a task picks up the dependency on schedule(), should we * only care about tasks that are currently on the CPU instead of all * that are on the runqueue? * * That is, does this want to be: task_on_cpu() / task_curr()? */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * unstable clocks. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage event-throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote IRQ work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU needs this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses IRQ work to drive the tick rescheduling on safe * locking contexts. But then we need IRQ work to raise its own * interrupts to avoid circular dependency on the tick. */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* #ifdef CONFIG_NO_HZ_FULL */ /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return tick_sched_flag_test(ts, TS_FLAG_STOPPED); } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * @now: current ktime_t * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned, is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE); write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note that this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure the next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_timers_pending() & BIT(TIMER_SOFTIRQ); } /* * Read jiffies and the time when jiffies were updated last */ u64 get_jiffies_update(unsigned long *basej) { unsigned long basejiff; unsigned int seq; u64 basemono; do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); *basej = basejiff; return basemono; } /** * tick_nohz_next_event() - return the clock monotonic based next event * @ts: pointer to tick_sched struct * @cpu: CPU number * * Return: * *%0 - When the next event is a maximum of TICK_NSEC in the future * and the tick is not stopped yet * *%next_event - Next event based on clock monotonic */ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; int tick_cpu; basemono = get_jiffies_update(&basejiff); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that, check whether the local timer softirq is * pending. If so, its a bad idea to call get_next_timer_interrupt(), * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta, which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* Make sure next_tick is never before basemono! */ if (WARN_ON_ONCE(basemono > next_tick)) next_tick = basemono; /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping 'max_deferment' value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu != cpu && (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST))) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); unsigned long basejiff = ts->last_jiffies; u64 basemono = ts->timer_expires_base; bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED); int tick_cpu; u64 expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * Now the tick should be stopped definitely - so the timer base needs * to be marked idle as well to not miss a newly queued timer. */ expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle); if (expires > ts->timer_expires) { /* * This path could only happen when the first timer was removed * between calculating the possible sleep length and now (when * high resolution mode is not active, timer could also be a * hrtimer). * * We have to stick to the original calculated expiry value to * not stop the tick for too long with a shallow C-state (which * was programmed by cpuidle because of an early next expiration * value). */ expires = ts->timer_expires; } /* If the timer base is not idle, retain the not yet stopped tick. */ if (!timer_idle) return; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here, the jiffies might be stale and * do_timer() never gets invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ tick_cpu = READ_ONCE(tick_do_timer_cpu); if (tick_cpu == cpu) { WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE); tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST); } else if (tick_cpu != TICK_DO_TIMER_NONE) { tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST); } /* Skip reprogram of event if it's not changed */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu " "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * tick_nohz_stop_tick() can be called several times before * tick_nohz_restart_sched_tick() is called. This happens when * interrupts arrive which do not cause a reschedule. In the first * call we save the current tick time, so we can restart the * scheduler tick in tick_nohz_restart_sched_tick(). */ if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); tick_sched_flag_set(ts, TS_FLAG_STOPPED); trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = expires; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, expires); tick_program_event(expires, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* Cancel the scheduled timer and restore the tick: */ tick_sched_flag_clear(ts, TS_FLAG_STOPPED); tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_full_stop_tick(ts, cpu); else if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach this code due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the code below, since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { WARN_ON_ONCE(cpu_is_offline(cpu)); if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ))) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { int tick_cpu = READ_ONCE(tick_do_timer_cpu); /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_set(ts, TS_FLAG_INIDLE); tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - Notify the tick about IRQ exit * * A timer may have been added/modified/deleted either by the current IRQ, * or by another place using this IRQ as a notification. This IRQ may have * also updated the RCU callback list. These events may require a * re-evaluation of the next tick. Depending on the context: * * 1) If the CPU is idle and no resched is pending, just proceed with idle * time accounting. The next tick will be re-evaluated on the next idle * loop iteration. * * 2) If the CPU is nohz_full: * * 2.1) If there is any tick dependency, restart the tick if stopped. * * 2.2) If there is no tick dependency, (re-)evaluate the next tick and * stop/update it accordingly. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_INIDLE)) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run * * Return: %true if the tick handler has run, otherwise %false */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whichever expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled * * Return: the next expiration time */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. * * Return: the expected length of the current sleep */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * @cpu: target CPU number * * Called from the schedutil frequency scaling governor in scheduler context. * * Return: the current idle calls counter value for @cpu */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. update_process_times() would miss the * time we slept, as it does only a 1 tick accounting. * Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - Update the tick upon idle task exit * * When the idle task exits, update the tick depending on the * following situations: * * 1) If the CPU is not in nohz_full mode (most cases), then * restart the tick. * * 2) If the CPU is in nohz_full mode (corner case): * 2.1) If the tick can be kept stopped (no tick dependencies) * then re-evaluate the next tick and try to keep it stopped * as long as possible. * 2.2) If the tick has dependencies, restart the tick. * */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE)); WARN_ON_ONCE(ts->timer_expires_base); tick_sched_flag_clear(ts, TS_FLAG_INIDLE); idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE); tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED); if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * In low-resolution mode, the tick handler must be implemented directly * at the clockevent level. hrtimer can't be used instead, because its * infrastructure actually relies on the tick itself as a backend in * low-resolution mode (see hrtimer_run_queues()). */ static void tick_nohz_lowres_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts) { if (!tick_nohz_enabled) return; tick_sched_flag_set(ts, TS_FLAG_NOHZ); /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to NOHZ mode */ static void tick_nohz_switch_to_nohz(void) { if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_lowres_handler)) return; /* * Recycle the hrtimer in 'ts', so we can share the * highres code. */ tick_setup_sched_timer(false); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) return; now = ktime_get(); if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle we may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter() to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer * @hrtimer: whether to use the hrtimer or not */ void tick_setup_sched_timer(bool hrtimer) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert 'jiffies_lock' contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); else tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts); } /* * Shut down the tick and make sure the CPU won't try to retake the timekeeping * duty before disabling IRQs in idle for the last time. */ void tick_sched_timer_dying(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t idle_sleeptime, iowait_sleeptime; unsigned long idle_calls, idle_sleeps; /* This must happen before hrtimers are migrated! */ if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) hrtimer_cancel(&ts->sched_timer); idle_sleeptime = ts->idle_sleeptime; iowait_sleeptime = ts->iowait_sleeptime; idle_calls = ts->idle_calls; idle_sleeps = ts->idle_sleeps; memset(ts, 0, sizeof(*ts)); ts->idle_sleeptime = idle_sleeptime; ts->iowait_sleeptime = iowait_sleeptime; ts->idle_calls = idle_calls; ts->idle_sleeps = idle_sleeps; } /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check if a change happened, which makes oneshot possible. * * Called cyclically from the hrtimer softirq (driven by the timer * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (tick_sched_flag_test(ts, TS_FLAG_NOHZ)) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 // SPDX-License-Identifier: GPL-2.0-only /* * STP SAP demux * * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/etherdevice.h> #include <linux/llc.h> #include <linux/slab.h> #include <linux/module.h> #include <net/llc.h> #include <net/llc_pdu.h> #include <net/stp.h> /* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */ #define GARP_ADDR_MIN 0x20 #define GARP_ADDR_MAX 0x2F #define GARP_ADDR_RANGE (GARP_ADDR_MAX - GARP_ADDR_MIN) static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly; static const struct stp_proto __rcu *stp_proto __read_mostly; static struct llc_sap *sap __read_mostly; static unsigned int sap_registered; static DEFINE_MUTEX(stp_proto_mutex); /* Called under rcu_read_lock from LLC */ static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { const struct ethhdr *eh = eth_hdr(skb); const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); const struct stp_proto *proto; if (pdu->ssap != LLC_SAP_BSPAN || pdu->dsap != LLC_SAP_BSPAN || pdu->ctrl_1 != LLC_PDU_TYPE_U) goto err; if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) { proto = rcu_dereference(garp_protos[eh->h_dest[5] - GARP_ADDR_MIN]); if (proto && !ether_addr_equal(eh->h_dest, proto->group_address)) goto err; } else proto = rcu_dereference(stp_proto); if (!proto) goto err; proto->rcv(proto, skb, dev); return 0; err: kfree_skb(skb); return 0; } int stp_proto_register(const struct stp_proto *proto) { int err = 0; mutex_lock(&stp_proto_mutex); if (sap_registered++ == 0) { sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv); if (!sap) { err = -ENOMEM; goto out; } } if (is_zero_ether_addr(proto->group_address)) rcu_assign_pointer(stp_proto, proto); else rcu_assign_pointer(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], proto); out: mutex_unlock(&stp_proto_mutex); return err; } EXPORT_SYMBOL_GPL(stp_proto_register); void stp_proto_unregister(const struct stp_proto *proto) { mutex_lock(&stp_proto_mutex); if (is_zero_ether_addr(proto->group_address)) RCU_INIT_POINTER(stp_proto, NULL); else RCU_INIT_POINTER(garp_protos[proto->group_address[5] - GARP_ADDR_MIN], NULL); synchronize_rcu(); if (--sap_registered == 0) llc_sap_put(sap); mutex_unlock(&stp_proto_mutex); } EXPORT_SYMBOL_GPL(stp_proto_unregister); MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2016 ARM Ltd. */ #ifndef __ASM_CHECKSUM_H #define __ASM_CHECKSUM_H #include <linux/in6.h> #define _HAVE_ARCH_IPV6_CSUM __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); static inline __sum16 csum_fold(__wsum csum) { u32 sum = (__force u32)csum; sum += (sum >> 16) | (sum << 16); return ~(__force __sum16)(sum >> 16); } #define csum_fold csum_fold static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { __uint128_t tmp; u64 sum; int n = ihl; /* we want it signed */ tmp = *(const __uint128_t *)iph; iph += 16; n -= 4; tmp += ((tmp >> 64) | (tmp << 64)); sum = tmp >> 64; do { sum += *(const u32 *)iph; iph += 4; } while (--n > 0); sum += ((sum >> 32) | (sum << 32)); return csum_fold((__force __wsum)(sum >> 32)); } #define ip_fast_csum ip_fast_csum extern unsigned int do_csum(const unsigned char *buff, int len); #define do_csum do_csum #include <asm-generic/checksum.h> #endif /* __ASM_CHECKSUM_H */
261 261 261 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 // SPDX-License-Identifier: GPL-2.0-only /* * Fixmap manipulation code */ #include <linux/bug.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/libfdt.h> #include <linux/memory.h> #include <linux/mm.h> #include <linux/sizes.h> #include <asm/fixmap.h> #include <asm/kernel-pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> /* ensure that the fixmap region does not grow down into the PCI I/O region */ static_assert(FIXADDR_TOT_START > PCI_IO_END); #define NR_BM_PTE_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT) #define NR_BM_PMD_TABLES \ SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PUD_SHIFT) static_assert(NR_BM_PMD_TABLES == 1); #define __BM_TABLE_IDX(addr, shift) \ (((addr) >> (shift)) - (FIXADDR_TOT_START >> (shift))) #define BM_PTE_TABLE_IDX(addr) __BM_TABLE_IDX(addr, PMD_SHIFT) static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss; static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused; static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; static inline pte_t *fixmap_pte(unsigned long addr) { return &bm_pte[BM_PTE_TABLE_IDX(addr)][pte_index(addr)]; } static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr) { pmd_t pmd = READ_ONCE(*pmdp); pte_t *ptep; if (pmd_none(pmd)) { ptep = bm_pte[BM_PTE_TABLE_IDX(addr)]; __pmd_populate(pmdp, __pa_symbol(ptep), PMD_TYPE_TABLE | PMD_TABLE_AF); } } static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr, unsigned long end) { unsigned long next; pud_t pud = READ_ONCE(*pudp); pmd_t *pmdp; if (pud_none(pud)) __pud_populate(pudp, __pa_symbol(bm_pmd), PUD_TYPE_TABLE | PUD_TABLE_AF); pmdp = pmd_offset_kimg(pudp, addr); do { next = pmd_addr_end(addr, end); early_fixmap_init_pte(pmdp, addr); } while (pmdp++, addr = next, addr != end); } static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end) { p4d_t p4d = READ_ONCE(*p4dp); pud_t *pudp; if (CONFIG_PGTABLE_LEVELS > 3 && !p4d_none(p4d) && p4d_page_paddr(p4d) != __pa_symbol(bm_pud)) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); } if (p4d_none(p4d)) __p4d_populate(p4dp, __pa_symbol(bm_pud), P4D_TYPE_TABLE | P4D_TABLE_AF); pudp = pud_offset_kimg(p4dp, addr); early_fixmap_init_pmd(pudp, addr, end); } /* * The p*d_populate functions call virt_to_phys implicitly so they can't be used * directly on kernel symbols (bm_p*d). This function is called too early to use * lm_alias so __p*d_populate functions must be used to populate with the * physical address from __pa_symbol. */ void __init early_fixmap_init(void) { unsigned long addr = FIXADDR_TOT_START; unsigned long end = FIXADDR_TOP; pgd_t *pgdp = pgd_offset_k(addr); p4d_t *p4dp = p4d_offset_kimg(pgdp, addr); early_fixmap_init_pud(p4dp, addr, end); } /* * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we * ever need to use IPIs for TLB broadcasting, then we're in trouble here. */ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); pte_t *ptep; BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); ptep = fixmap_pte(addr); if (pgprot_val(flags)) { __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) { const u64 dt_virt_base = __fix_to_virt(FIX_FDT); phys_addr_t dt_phys_base; int offset; void *dt_virt; /* * Check whether the physical FDT address is set and meets the minimum * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be * at least 8 bytes so that we can always access the magic and size * fields of the FDT header after mapping the first chunk, double check * here if that is indeed the case. */ BUILD_BUG_ON(MIN_FDT_ALIGN < 8); if (!dt_phys || dt_phys % MIN_FDT_ALIGN) return NULL; dt_phys_base = round_down(dt_phys, PAGE_SIZE); offset = dt_phys % PAGE_SIZE; dt_virt = (void *)dt_virt_base + offset; /* map the first chunk so we can read the size from the header */ create_mapping_noalloc(dt_phys_base, dt_virt_base, PAGE_SIZE, prot); if (fdt_magic(dt_virt) != FDT_MAGIC) return NULL; *size = fdt_totalsize(dt_virt); if (*size > MAX_FDT_SIZE) return NULL; if (offset + *size > PAGE_SIZE) { create_mapping_noalloc(dt_phys_base, dt_virt_base, offset + *size, prot); } return dt_virt; }
6 6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/dsa/user.c - user device handling * Copyright (c) 2008-2009 Marvell Semiconductor */ #include <linux/list.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #include <linux/phylink.h> #include <linux/of_net.h> #include <linux/of_mdio.h> #include <linux/mdio.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/selftests.h> #include <net/tc_act/tc_mirred.h> #include <linux/if_bridge.h> #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> #include <linux/string.h> #include "conduit.h" #include "dsa.h" #include "netlink.h" #include "port.h" #include "switch.h" #include "tag.h" #include "user.h" struct dsa_switchdev_event_work { struct net_device *dev; struct net_device *orig_dev; struct work_struct work; unsigned long event; /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and * SWITCHDEV_FDB_DEL_TO_DEVICE */ unsigned char addr[ETH_ALEN]; u16 vid; bool host_addr; }; enum dsa_standalone_event { DSA_UC_ADD, DSA_UC_DEL, DSA_MC_ADD, DSA_MC_DEL, }; struct dsa_standalone_event_work { struct work_struct work; struct net_device *dev; enum dsa_standalone_event event; unsigned char addr[ETH_ALEN]; u16 vid; }; struct dsa_host_vlan_rx_filtering_ctx { struct net_device *dev; const unsigned char *addr; enum dsa_standalone_event event; }; static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) { return ds->ops->port_mdb_add && ds->ops->port_mdb_del && ds->fdb_isolation && !ds->vlan_filtering_is_global && !ds->needs_standalone_vlan_filtering; } static void dsa_user_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = container_of(work, struct dsa_standalone_event_work, work); const unsigned char *addr = standalone_work->addr; struct net_device *dev = standalone_work->dev; struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_mdb mdb; struct dsa_switch *ds = dp->ds; u16 vid = standalone_work->vid; int err; switch (standalone_work->event) { case DSA_UC_ADD: err = dsa_port_standalone_host_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_UC_DEL: err = dsa_port_standalone_host_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; case DSA_MC_ADD: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_add(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to mdb: %d\n", dp->index, addr, vid, err); break; } break; case DSA_MC_DEL: ether_addr_copy(mdb.addr, addr); mdb.vid = vid; err = dsa_port_standalone_host_mdb_del(dp, &mdb); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from mdb: %d\n", dp->index, addr, vid, err); } break; } kfree(standalone_work); } static int dsa_user_schedule_standalone_work(struct net_device *dev, enum dsa_standalone_event event, const unsigned char *addr, u16 vid) { struct dsa_standalone_event_work *standalone_work; standalone_work = kzalloc(sizeof(*standalone_work), GFP_ATOMIC); if (!standalone_work) return -ENOMEM; INIT_WORK(&standalone_work->work, dsa_user_standalone_event_work); standalone_work->event = event; standalone_work->dev = dev; ether_addr_copy(standalone_work->addr, addr); standalone_work->vid = vid; dsa_schedule_work(&standalone_work->work); return 0; } static int dsa_user_host_vlan_rx_filtering(void *arg, int vid) { struct dsa_host_vlan_rx_filtering_ctx *ctx = arg; return dsa_user_schedule_standalone_work(ctx->dev, ctx->event, ctx->addr, vid); } static int dsa_user_vlan_for_each(struct net_device *dev, int (*cb)(void *arg, int vid), void *arg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_vlan *v; int err; lockdep_assert_held(&dev->addr_list_lock); err = cb(arg, 0); if (err) return err; list_for_each_entry(v, &dp->user_vlans, list) { err = cb(arg, v->vid); if (err) return err; } return 0; } static int dsa_user_sync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_ADD, }; dev_uc_add(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_UC_DEL, }; dev_uc_del(conduit, addr); if (!dsa_switch_supports_uc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_sync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_ADD, }; dev_mc_add(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, const unsigned char *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_host_vlan_rx_filtering_ctx ctx = { .dev = dev, .addr = addr, .event = DSA_MC_DEL, }; dev_mc_del(conduit, addr); if (!dsa_switch_supports_mc_filtering(dp->ds)) return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, &ctx); } void dsa_user_sync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_sync_mc(dev, ha->addr); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_sync_uc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } void dsa_user_unsync_ha(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; netif_addr_lock_bh(dev); netdev_for_each_synced_uc_addr(ha, dev) dsa_user_unsync_uc(dev, ha->addr); netdev_for_each_synced_mc_addr(ha, dev) dsa_user_unsync_mc(dev, ha->addr); netif_addr_unlock_bh(dev); if (dsa_switch_supports_uc_filtering(ds) || dsa_switch_supports_mc_filtering(ds)) dsa_flush_workqueue(); } /* user mii_bus handling ***************************************************/ static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_read(ds, addr, reg); return 0xffff; } static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val) { struct dsa_switch *ds = bus->priv; if (ds->phys_mii_mask & (1 << addr)) return ds->ops->phy_write(ds, addr, reg, val); return 0; } void dsa_user_mii_bus_init(struct dsa_switch *ds) { ds->user_mii_bus->priv = (void *)ds; ds->user_mii_bus->name = "dsa user smi"; ds->user_mii_bus->read = dsa_user_phy_read; ds->user_mii_bus->write = dsa_user_phy_write; snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d", ds->dst->index, ds->index); ds->user_mii_bus->parent = ds->dev; ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask; } /* user device handling ****************************************************/ static int dsa_user_get_iflink(const struct net_device *dev) { return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); } int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err; if (dsa_switch_supports_uc_filtering(ds)) { err = dsa_port_standalone_host_fdb_add(dp, addr, 0); if (err) goto out; } if (!ether_addr_equal(addr, conduit->dev_addr)) { err = dev_uc_add(conduit, addr); if (err < 0) goto del_host_addr; } return 0; del_host_addr: if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, addr, 0); out: return err; } void dsa_user_host_uc_uninstall(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr)) dev_uc_del(conduit, dev->dev_addr); if (dsa_switch_supports_uc_filtering(ds)) dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0); } static int dsa_user_open(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); int err; err = dev_open(conduit, NULL); if (err < 0) { netdev_err(dev, "failed to open conduit %s\n", conduit->name); goto out; } err = dsa_user_host_uc_install(dev, dev->dev_addr); if (err) goto out; err = dsa_port_enable_rt(dp, dev->phydev); if (err) goto out_del_host_uc; return 0; out_del_host_uc: dsa_user_host_uc_uninstall(dev); out: return err; } static int dsa_user_close(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); dsa_port_disable_rt(dp); dsa_user_host_uc_uninstall(dev); return 0; } static void dsa_user_manage_host_flood(struct net_device *dev) { bool mc = dev->flags & (IFF_PROMISC | IFF_ALLMULTI); struct dsa_port *dp = dsa_user_to_port(dev); bool uc = dev->flags & IFF_PROMISC; dsa_port_set_host_flood(dp, uc, mc); } static void dsa_user_change_rx_flags(struct net_device *dev, int change) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (change & IFF_ALLMULTI) dev_set_allmulti(conduit, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(conduit, dev->flags & IFF_PROMISC ? 1 : -1); if (dsa_switch_supports_uc_filtering(ds) && dsa_switch_supports_mc_filtering(ds)) dsa_user_manage_host_flood(dev); } static void dsa_user_set_rx_mode(struct net_device *dev) { __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); } static int dsa_user_set_mac_address(struct net_device *dev, void *a) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct sockaddr *addr = a; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (ds->ops->port_set_mac_address) { err = ds->ops->port_set_mac_address(ds, dp->index, addr->sa_data); if (err) return err; } /* If the port is down, the address isn't synced yet to hardware or * to the DSA conduit, so there is nothing to change. */ if (!(dev->flags & IFF_UP)) goto out_change_dev_addr; err = dsa_user_host_uc_install(dev, addr->sa_data); if (err) return err; dsa_user_host_uc_uninstall(dev); out_change_dev_addr: eth_hw_addr_set(dev, addr->sa_data); return 0; } struct dsa_user_dump_ctx { struct net_device *dev; struct sk_buff *skb; struct netlink_callback *cb; int idx; }; static int dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dump->dev->ifindex; ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr)) goto nla_put_failure; if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid)) goto nla_put_failure; nlmsg_end(dump->skb, nlh); skip: dump->idx++; return 0; nla_put_failure: nlmsg_cancel(dump->skb, nlh); return -EMSGSIZE; } static int dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_dump_ctx dump = { .dev = dev, .skb = skb, .cb = cb, .idx = *idx, }; int err; err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump); *idx = dump.idx; return err; } static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx, const struct switchdev_attr *attr, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int ret; if (ctx && ctx != dp) return 0; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_PORT_MST_STATE: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering, extack); break; case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_ageing_time(dp, attr->u.ageing_time); break; case SWITCHDEV_ATTR_ID_BRIDGE_MST: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_mst_enable(dp, attr->u.mst, extack); break; case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS: if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; case SWITCHDEV_ATTR_ID_VLAN_MSTI: if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev)) return -EOPNOTSUPP; ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti); break; default: ret = -EOPNOTSUPP; break; } return ret; } /* Must be called under rcu_read_lock() */ static int dsa_user_vlan_check_for_8021q_uppers(struct net_device *user, const struct switchdev_obj_port_vlan *vlan) { struct net_device *upper_dev; struct list_head *iter; netdev_for_each_upper_dev_rcu(user, upper_dev, iter) { u16 vid; if (!is_vlan_dev(upper_dev)) continue; vid = vlan_dev_vlan_id(upper_dev); if (vid == vlan->vid) return -EBUSY; } return 0; } static int dsa_user_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; int err; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); /* Deny adding a bridge VLAN when there is already an 802.1Q upper with * the same VID. */ if (br_vlan_enabled(dsa_port_bridge_dev_get(dp))) { rcu_read_lock(); err = dsa_user_vlan_check_for_8021q_uppers(dev, vlan); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG_MOD(extack, "Port already has a VLAN upper with this VID"); return err; } } return dsa_port_vlan_add(dp, vlan, extack); } /* Offload a VLAN installed on the bridge or on a foreign interface by * installing it as a VLAN towards the CPU port. */ static int dsa_user_host_vlan_add(struct net_device *dev, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) { NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN"); return 0; } vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); /* Even though drivers often handle CPU membership in special ways, * it doesn't make sense to program a PVID, so clear this flag. */ vlan.flags &= ~BRIDGE_VLAN_INFO_PVID; return dsa_port_host_vlan_add(dp, &vlan, extack); } static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_add(dev, obj, extack); else err = dsa_user_host_vlan_add(dev, obj, extack); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_add_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static int dsa_user_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_vlan_del(dp, vlan); } static int dsa_user_host_vlan_del(struct net_device *dev, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan *vlan; /* Do nothing if this is a software bridge */ if (!dp->bridge) return -EOPNOTSUPP; if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = SWITCHDEV_OBJ_PORT_VLAN(obj); return dsa_port_host_vlan_del(dp, vlan); } static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj) { struct dsa_port *dp = dsa_user_to_port(dev); int err; if (ctx && ctx != dp) return 0; switch (obj->id) { case SWITCHDEV_OBJ_ID_PORT_MDB: if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_HOST_MDB: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_bridge_host_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj)); break; case SWITCHDEV_OBJ_ID_PORT_VLAN: if (dsa_port_offloads_bridge_port(dp, obj->orig_dev)) err = dsa_user_vlan_del(dev, obj); else err = dsa_user_host_vlan_del(dev, obj); break; case SWITCHDEV_OBJ_ID_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj)); break; case SWITCHDEV_OBJ_ID_RING_ROLE_MRP: if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev)) return -EOPNOTSUPP; err = dsa_port_mrp_del_ring_role(dp, SWITCHDEV_OBJ_RING_ROLE_MRP(obj)); break; default: err = -EOPNOTSUPP; break; } return err; } static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); return netpoll_send_skb(p->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, struct sk_buff *skb) { struct dsa_switch *ds = p->dp->ds; if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) return; ds->ops->port_txtstamp(ds, p->dp->index, skb); } netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev) { /* SKB for netpoll still need to be mangled with the protocol-specific * tag to be successfully transmitted */ if (unlikely(netpoll_tx_running(dev))) return dsa_user_netpoll_send_skb(dev, skb); /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ skb->dev = dsa_user_to_conduit(dev); dev_queue_xmit(skb); return NETDEV_TX_OK; } EXPORT_SYMBOL_GPL(dsa_enqueue_skb); static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct sk_buff *nskb; dev_sw_netstats_tx_add(dev, 1, skb->len); memset(skb->cb, 0, sizeof(skb->cb)); /* Handle tx timestamp if any */ dsa_skb_tx_timestamp(p, skb); if (skb_ensure_writable_head_tail(skb, dev)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* needed_tailroom should still be 'warm' in the cache line from * skb_ensure_writable_head_tail(), which has also ensured that * padding is safe. */ if (dev->needed_tailroom) eth_skb_pad(skb); /* Transmit function may have to reallocate the original SKB, * in which case it must have freed it. Only free it here on error. */ nskb = p->xmit(skb, dev); if (!nskb) { kfree_skb(skb); return NETDEV_TX_OK; } return dsa_enqueue_skb(nskb, dev); } /* ethtool operations *******************************************************/ static void dsa_user_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version)); strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info)); } static int dsa_user_get_regs_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs_len) return ds->ops->get_regs_len(ds, dp->index); return -EOPNOTSUPP; } static void dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_regs) ds->ops->get_regs(ds, dp->index, regs, _p); } static int dsa_user_nway_reset(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_nway_reset(dp->pl); } static int dsa_user_get_eeprom_len(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->cd && ds->cd->eeprom_len) return ds->cd->eeprom_len; if (ds->ops->get_eeprom_len) return ds->ops->get_eeprom_len(ds); return 0; } static int dsa_user_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eeprom) return ds->ops->get_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static int dsa_user_set_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->set_eeprom) return ds->ops->set_eeprom(ds, eeprom, data); return -EOPNOTSUPP; } static void dsa_user_get_strings(struct net_device *dev, uint32_t stringset, uint8_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (stringset == ETH_SS_STATS) { ethtool_puts(&data, "tx_packets"); ethtool_puts(&data, "tx_bytes"); ethtool_puts(&data, "rx_packets"); ethtool_puts(&data, "rx_bytes"); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data); } else if (stringset == ETH_SS_TEST) { net_selftest_get_strings(data); } } static void dsa_user_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct pcpu_sw_netstats *s; unsigned int start; int i; for_each_possible_cpu(i) { u64 tx_packets, tx_bytes, rx_packets, rx_bytes; s = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; data[3] += rx_bytes; } if (ds->ops->get_ethtool_stats) ds->ops->get_ethtool_stats(ds, dp->index, data + 4); } static int dsa_user_get_sset_count(struct net_device *dev, int sset) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (sset == ETH_SS_STATS) { int count = 0; if (ds->ops->get_sset_count) { count = ds->ops->get_sset_count(ds, dp->index, sset); if (count < 0) return count; } return count + 4; } else if (sset == ETH_SS_TEST) { return net_selftest_get_count(); } return -EOPNOTSUPP; } static void dsa_user_get_eth_phy_stats(struct net_device *dev, struct ethtool_eth_phy_stats *phy_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_phy_stats) ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); } static void dsa_user_get_eth_mac_stats(struct net_device *dev, struct ethtool_eth_mac_stats *mac_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_mac_stats) ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); } static void dsa_user_get_eth_ctrl_stats(struct net_device *dev, struct ethtool_eth_ctrl_stats *ctrl_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_eth_ctrl_stats) ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); } static void dsa_user_get_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats, const struct ethtool_rmon_hist_range **ranges) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_rmon_stats) ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } static void dsa_user_get_ts_stats(struct net_device *dev, struct ethtool_ts_stats *ts_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_ts_stats) ds->ops->get_ts_stats(ds, dp->index, ts_stats); } static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { struct dsa_port *dp = dsa_user_to_port(ndev); struct dsa_switch *ds = dp->ds; if (ds->ops->self_test) { ds->ops->self_test(ds, dp->index, etest, buf); return; } net_selftest(ndev, etest, buf); } static int dsa_user_get_mm(struct net_device *dev, struct ethtool_mm_state *state) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_mm) return -EOPNOTSUPP; return ds->ops->get_mm(ds, dp->index, state); } static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_mm) return -EOPNOTSUPP; return ds->ops->set_mm(ds, dp->index, cfg, extack); } static void dsa_user_get_mm_stats(struct net_device *dev, struct ethtool_mm_stats *stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_mm_stats) ds->ops->get_mm_stats(ds, dp->index, stats); } static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; phylink_ethtool_get_wol(dp->pl, w); if (ds->ops->get_wol) ds->ops->get_wol(ds, dp->index, w); } static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret = -EOPNOTSUPP; phylink_ethtool_set_wol(dp->pl, w); if (ds->ops->set_wol) ret = ds->ops->set_wol(ds, dp->index, w); return ret; } static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int ret; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* If the port is using phylink managed EEE, then an unimplemented * set_mac_eee() is permissible. */ if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; if (!ds->ops->set_mac_eee) return -EOPNOTSUPP; ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } else if (ds->ops->set_mac_eee) { ret = ds->ops->set_mac_eee(ds, dp->index, e); if (ret) return ret; } return phylink_ethtool_set_eee(dp->pl, e); } static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; /* Check whether the switch supports EEE */ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; /* Port's PHY and MAC both need to be EEE capable */ if (!dev->phydev) return -ENODEV; return phylink_ethtool_get_eee(dp->pl, e); } static int dsa_user_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_get(dp->pl, cmd); } static int dsa_user_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_ksettings_set(dp->pl, cmd); } static void dsa_user_get_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_pause_stats) ds->ops->get_pause_stats(ds, dp->index, pause_stats); } static void dsa_user_get_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); phylink_ethtool_get_pauseparam(dp->pl, pause); } static int dsa_user_set_pauseparam(struct net_device *dev, struct ethtool_pauseparam *pause) { struct dsa_port *dp = dsa_user_to_port(dev); return phylink_ethtool_set_pauseparam(dp->pl, pause); } #ifdef CONFIG_NET_POLL_CONTROLLER static int dsa_user_netpoll_setup(struct net_device *dev) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); if (!netpoll) return -ENOMEM; err = __netpoll_setup(netpoll, conduit); if (err) { kfree(netpoll); goto out; } p->netpoll = netpoll; out: return err; } static void dsa_user_netpoll_cleanup(struct net_device *dev) { struct dsa_user_priv *p = netdev_priv(dev); struct netpoll *netpoll = p->netpoll; if (!netpoll) return; p->netpoll = NULL; __netpoll_free(netpoll); } static void dsa_user_poll_controller(struct net_device *dev) { } #endif static struct dsa_mall_tc_entry * dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_tc_entry *mall_tc_entry; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) if (mall_tc_entry->cookie == cookie) return mall_tc_entry; return NULL; } static int dsa_user_add_cls_matchall_mirred(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress, bool ingress_target) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_mirror_tc_entry *mirror; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; struct dsa_port *to_dp; int err; if (cls->common.protocol != htons(ETH_P_ALL)) { NL_SET_ERR_MSG_MOD(extack, "Can only offload \"protocol all\" matchall filter"); return -EOPNOTSUPP; } if (!ds->ops->port_mirror_add) { NL_SET_ERR_MSG_MOD(extack, "Switch does not support mirroring operation"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; act = &cls->rule->action.entries[0]; if (!act->dev) return -EINVAL; if (dsa_user_dev_check(act->dev)) { if (ingress_target) { /* We can only fulfill this using software assist */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to ingress of DSA user port if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } else { to_dp = dsa_user_to_port(act->dev); } } else { /* Handle mirroring to foreign target ports as a mirror towards * the CPU. The software tc rule will take the packets from * there. */ if (cls->common.skip_sw) { NL_SET_ERR_MSG_MOD(extack, "Can only mirred to CPU if filter also runs in software"); return -EOPNOTSUPP; } to_dp = dp->cpu_dp; } if (dp->ds != to_dp->ds) { NL_SET_ERR_MSG_MOD(extack, "Cross-chip mirroring not implemented"); return -EOPNOTSUPP; } mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_MIRROR; mirror = &mall_tc_entry->mirror; mirror->to_local_port = to_dp->index; mirror->ingress = ingress; err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall_police(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, "Only supported on ingress qdisc"); return -EOPNOTSUPP; } if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack)) return -EOPNOTSUPP; list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, "Only one port policer allowed"); return -EEXIST; } } act = &cls->rule->action.entries[0]; mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL); if (!mall_tc_entry) return -ENOMEM; mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; policer->rate_bytes_per_sec = act->police.rate_bytes_ps; policer->burst = act->police.burst; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { kfree(mall_tc_entry); return err; } list_add_tail(&mall_tc_entry->list, &p->mall_tc_list); return err; } static int dsa_user_add_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { const struct flow_action *action = &cls->rule->action; struct netlink_ext_ack *extack = cls->common.extack; if (!flow_offload_has_one_action(action)) { NL_SET_ERR_MSG_MOD(extack, "Cannot offload matchall filter with more than one action"); return -EOPNOTSUPP; } switch (action->entries[0].id) { case FLOW_ACTION_MIRRED: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, false); case FLOW_ACTION_MIRRED_INGRESS: return dsa_user_add_cls_matchall_mirred(dev, cls, ingress, true); case FLOW_ACTION_POLICE: return dsa_user_add_cls_matchall_police(dev, cls, ingress); default: NL_SET_ERR_MSG_MOD(extack, "Unknown action"); break; } return -EOPNOTSUPP; } static void dsa_user_del_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_mall_tc_entry *mall_tc_entry; struct dsa_switch *ds = dp->ds; mall_tc_entry = dsa_user_mall_tc_entry_find(dev, cls->cookie); if (!mall_tc_entry) return; list_del(&mall_tc_entry->list); switch (mall_tc_entry->type) { case DSA_PORT_MALL_MIRROR: if (ds->ops->port_mirror_del) ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror); break; case DSA_PORT_MALL_POLICER: if (ds->ops->port_policer_del) ds->ops->port_policer_del(ds, dp->index); break; default: WARN_ON(1); } kfree(mall_tc_entry); } static int dsa_user_setup_tc_cls_matchall(struct net_device *dev, struct tc_cls_matchall_offload *cls, bool ingress) { if (cls->common.chain_index) return -EOPNOTSUPP; switch (cls->command) { case TC_CLSMATCHALL_REPLACE: return dsa_user_add_cls_matchall(dev, cls, ingress); case TC_CLSMATCHALL_DESTROY: dsa_user_del_cls_matchall(dev, cls); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_add_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_add) return -EOPNOTSUPP; return ds->ops->cls_flower_add(ds, port, cls, ingress); } static int dsa_user_del_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_del) return -EOPNOTSUPP; return ds->ops->cls_flower_del(ds, port, cls, ingress); } static int dsa_user_stats_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->cls_flower_stats) return -EOPNOTSUPP; return ds->ops->cls_flower_stats(ds, port, cls, ingress); } static int dsa_user_setup_tc_cls_flower(struct net_device *dev, struct flow_cls_offload *cls, bool ingress) { switch (cls->command) { case FLOW_CLS_REPLACE: return dsa_user_add_cls_flower(dev, cls, ingress); case FLOW_CLS_DESTROY: return dsa_user_del_cls_flower(dev, cls, ingress); case FLOW_CLS_STATS: return dsa_user_stats_cls_flower(dev, cls, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv, bool ingress) { struct net_device *dev = cb_priv; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSMATCHALL: return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress); case TC_SETUP_CLSFLOWER: return dsa_user_setup_tc_cls_flower(dev, type_data, ingress); default: return -EOPNOTSUPP; } } static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true); } static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type, void *type_data, void *cb_priv) { return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false); } static LIST_HEAD(dsa_user_block_cb_list); static int dsa_user_setup_tc_block(struct net_device *dev, struct flow_block_offload *f) { struct flow_block_cb *block_cb; flow_setup_cb_t *cb; if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) cb = dsa_user_setup_tc_block_cb_ig; else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) cb = dsa_user_setup_tc_block_cb_eg; else return -EOPNOTSUPP; f->driver_block_list = &dsa_user_block_cb_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, dev, dev, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, dev); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port)); if (!conduit->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT, type_data); } static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type, void *type_data) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; switch (type) { case TC_SETUP_BLOCK: return dsa_user_setup_tc_block(dev, type_data); case TC_SETUP_FT: return dsa_user_setup_ft_block(ds, dp->index, type_data); default: break; } if (!ds->ops->port_setup_tc) return -EOPNOTSUPP; return ds->ops->port_setup_tc(ds, dp->index, type, type_data); } static int dsa_user_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc, u32 *rule_locs) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->get_rxnfc) return -EOPNOTSUPP; return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs); } static int dsa_user_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *nfc) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->set_rxnfc) return -EOPNOTSUPP; return ds->ops->set_rxnfc(ds, dp->index, nfc); } static int dsa_user_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *ts) { struct dsa_user_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->dp->ds; if (!ds->ops->get_ts_info) return -EOPNOTSUPP; return ds->ops->get_ts_info(ds, p->dp->index, ts); } static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct netlink_ext_ack extack = {0}; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int ret; /* User port... */ ret = dsa_port_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "%s\n", extack._msg); return ret; } /* And CPU port... */ ret = dsa_port_host_vlan_add(dp, &vlan, &extack); if (ret) { if (extack._msg) netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index, extack._msg); return ret; } if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; v = kzalloc(sizeof(*v), GFP_KERNEL); if (!v) { ret = -ENOMEM; goto rollback; } netif_addr_lock_bh(dev); v->vid = vid; list_add_tail(&v->list, &dp->user_vlans); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_ADD, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_ADD, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; rollback: dsa_port_host_vlan_del(dp, &vlan); dsa_port_vlan_del(dp, &vlan); return ret; } static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) { struct dsa_port *dp = dsa_user_to_port(dev); struct switchdev_obj_port_vlan vlan = { .vid = vid, /* This API only allows programming tagged, non-PVID VIDs */ .flags = 0, }; struct dsa_switch *ds = dp->ds; struct netdev_hw_addr *ha; struct dsa_vlan *v; int err; err = dsa_port_vlan_del(dp, &vlan); if (err) return err; err = dsa_port_host_vlan_del(dp, &vlan); if (err) return err; if (!dsa_switch_supports_uc_filtering(ds) && !dsa_switch_supports_mc_filtering(ds)) return 0; netif_addr_lock_bh(dev); v = dsa_vlan_find(&dp->user_vlans, &vlan); if (!v) { netif_addr_unlock_bh(dev); return -ENOENT; } list_del(&v->list); kfree(v); if (dsa_switch_supports_mc_filtering(ds)) { netdev_for_each_synced_mc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_MC_DEL, ha->addr, vid); } } if (dsa_switch_supports_uc_filtering(ds)) { netdev_for_each_synced_uc_addr(ha, dev) { dsa_user_schedule_standalone_work(dev, DSA_UC_DEL, ha->addr, vid); } } netif_addr_unlock_bh(dev); dsa_flush_workqueue(); return 0; } static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_add_vid(arg, proto, vid); } static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg) { __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); return dsa_user_vlan_rx_kill_vid(arg, proto, vid); } /* Keep the VLAN RX filtering list in sync with the hardware only if VLAN * filtering is enabled. The baseline is that only ports that offload a * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, * but there are exceptions for quirky hardware. * * If ds->vlan_filtering_is_global = true, then standalone ports which share * the same switch with other ports that offload a VLAN-aware bridge are also * inevitably VLAN-aware. * * To summarize, a DSA switch port offloads: * * - If standalone (this includes software bridge, software LAG): * - if ds->needs_standalone_vlan_filtering = true, OR if * (ds->vlan_filtering_is_global = true AND there are bridges spanning * this switch chip which have vlan_filtering=1) * - the 8021q upper VLANs * - else (standalone VLAN filtering is not needed, VLAN filtering is not * global, or it is, but no port is under a VLAN-aware bridge): * - no VLAN (any 8021q upper is a software VLAN) * * - If under a vlan_filtering=0 bridge which it offload: * - if ds->configure_vlan_while_not_filtering = true (default): * - the bridge VLANs. These VLANs are committed to hardware but inactive. * - else (deprecated): * - no VLAN. The bridge VLANs are not restored when VLAN awareness is * enabled, so this behavior is broken and discouraged. * * - If under a vlan_filtering=1 bridge which it offload: * - the bridge VLANs * - the 8021q upper VLANs */ int dsa_user_manage_vlan_filtering(struct net_device *user, bool vlan_filtering) { int err; if (vlan_filtering) { user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; err = vlan_for_each(user, dsa_user_restore_vlan, user); if (err) { vlan_for_each(user, dsa_user_clear_vlan, user); user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; return err; } } else { err = vlan_for_each(user, dsa_user_clear_vlan, user); if (err) return err; user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } return 0; } struct dsa_hw_port { struct list_head list; struct net_device *dev; int old_mtu; }; static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu) { const struct dsa_hw_port *p; int err; list_for_each_entry(p, hw_port_list, list) { if (p->dev->mtu == mtu) continue; err = dev_set_mtu(p->dev, mtu); if (err) goto rollback; } return 0; rollback: list_for_each_entry_continue_reverse(p, hw_port_list, list) { if (p->dev->mtu == p->old_mtu) continue; if (dev_set_mtu(p->dev, p->old_mtu)) netdev_err(p->dev, "Failed to restore MTU\n"); } return err; } static void dsa_hw_port_list_free(struct list_head *hw_port_list) { struct dsa_hw_port *p, *n; list_for_each_entry_safe(p, n, hw_port_list, list) kfree(p); } /* Make the hardware datapath to/from @dev limited to a common MTU */ static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; int min_mtu = ETH_MAX_MTU; struct dsa_port *other_dp; int err; if (!dp->ds->mtu_enforcement_ingress) return; if (!dp->bridge) return; INIT_LIST_HEAD(&hw_port_list); /* Populate the list of ports that are part of the same bridge * as the newly added/modified port */ list_for_each_entry(dst, &dsa_tree_list, list) { list_for_each_entry(other_dp, &dst->ports, list) { struct dsa_hw_port *hw_port; struct net_device *user; if (other_dp->type != DSA_PORT_TYPE_USER) continue; if (!dsa_port_bridge_same(dp, other_dp)) continue; if (!other_dp->ds->mtu_enforcement_ingress) continue; user = other_dp->user; if (min_mtu > user->mtu) min_mtu = user->mtu; hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL); if (!hw_port) goto out; hw_port->dev = user; hw_port->old_mtu = user->mtu; list_add(&hw_port->list, &hw_port_list); } } /* Attempt to configure the entire hardware bridge to the newly added * interface's MTU first, regardless of whether the intention of the * user was to raise or lower it. */ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu); if (!err) goto out; /* Clearly that didn't work out so well, so just set the minimum MTU on * all hardware bridge ports now. If this fails too, then all ports will * still have their old MTU rolled back anyway. */ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu); out: dsa_hw_port_list_free(&hw_port_list); } int dsa_user_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_port *cpu_dp = dp->cpu_dp; struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int largest_mtu = 0; int new_conduit_mtu; int old_conduit_mtu; int mtu_limit; int overhead; int cpu_mtu; int err; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; dsa_tree_for_each_user_port(other_dp, ds->dst) { int user_mtu; /* During probe, this function will be called for each user * device, while not all of them have been allocated. That's * ok, it doesn't change what the maximum is, so ignore it. */ if (!other_dp->user) continue; /* Pretend that we already applied the setting, which we * actually haven't (still haven't done all integrity checks) */ if (dp == other_dp) user_mtu = new_mtu; else user_mtu = other_dp->user->mtu; if (largest_mtu < user_mtu) largest_mtu = user_mtu; } overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops); mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead); old_conduit_mtu = conduit->mtu; new_conduit_mtu = largest_mtu + overhead; if (new_conduit_mtu > mtu_limit) return -ERANGE; /* If the conduit MTU isn't over limit, there's no need to check the CPU * MTU, since that surely isn't either. */ cpu_mtu = largest_mtu; /* Start applying stuff */ if (new_conduit_mtu != old_conduit_mtu) { err = dev_set_mtu(conduit, new_conduit_mtu); if (err < 0) goto out_conduit_failed; /* We only need to propagate the MTU of the CPU port to * upstream switches, so emit a notifier which updates them. */ err = dsa_port_mtu_change(cpu_dp, cpu_mtu); if (err) goto out_cpu_failed; } err = ds->ops->port_change_mtu(ds, dp->index, new_mtu); if (err) goto out_port_failed; WRITE_ONCE(dev->mtu, new_mtu); dsa_bridge_mtu_normalization(dp); return 0; out_port_failed: if (new_conduit_mtu != old_conduit_mtu) dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead); out_cpu_failed: if (new_conduit_mtu != old_conduit_mtu) dev_set_mtu(conduit, old_conduit_mtu); out_conduit_failed: return err; } static int __maybe_unused dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_set_apptrust) return -EOPNOTSUPP; return ds->ops->port_set_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; if (!ds->ops->port_get_apptrust) return -EOPNOTSUPP; return ds->ops->port_get_apptrust(ds, port, sel, nsel); } static int __maybe_unused dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } return 0; } /* Update the DSCP prio entries on all user ports of the switch in case * the switch supports global DSCP prio instead of per port DSCP prios. */ static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev, struct dcb_app *app, bool del) { int (*setdel)(struct net_device *dev, struct dcb_app *app); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct dsa_port *other_dp; int err, restore_err; if (del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; err = setdel(user, app); if (err) goto err_try_to_restore; } return 0; err_try_to_restore: /* Revert logic to restore previous state of app entries */ if (!del) setdel = dcb_ieee_delapp; else setdel = dcb_ieee_setapp; dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) { struct net_device *user = other_dp->user; if (!user || user == dev) continue; restore_err = setdel(user, app); if (restore_err) netdev_err(user, "Failed to restore DSCP prio entry configuration\n"); } return err; } static int __maybe_unused dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_add_dscp_prio) return -EOPNOTSUPP; if (dscp >= 64) { netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n", dscp); return -EINVAL; } err = dcb_ieee_setapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = __fls(mask); err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio); if (err) { dcb_ieee_delapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false); if (err) { if (ds->ops->port_del_dscp_prio) ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio); dcb_ieee_delapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_set_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_add_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } static int __maybe_unused dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; unsigned long mask, new_prio; int err, port = dp->index; if (!ds->ops->port_set_default_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; mask = dcb_ieee_getapp_mask(dev, app); new_prio = mask ? __fls(mask) : 0; err = ds->ops->port_set_default_prio(ds, port, new_prio); if (err) { dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int err, port = dp->index; u8 dscp = app->protocol; if (!ds->ops->port_del_dscp_prio) return -EOPNOTSUPP; err = dcb_ieee_delapp(dev, app); if (err) return err; err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority); if (err) { dcb_ieee_setapp(dev, app); return err; } if (!ds->dscp_prio_mapping_is_global) return 0; err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true); if (err) { if (ds->ops->port_add_dscp_prio) ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority); dcb_ieee_setapp(dev, app); return err; } return 0; } static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev, struct dcb_app *app) { switch (app->selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: switch (app->protocol) { case 0: return dsa_user_dcbnl_del_default_prio(dev, app); default: return -EOPNOTSUPP; } break; case IEEE_8021QAZ_APP_SEL_DSCP: return dsa_user_dcbnl_del_dscp_prio(dev, app); default: return -EOPNOTSUPP; } } /* Pre-populate the DCB application priority table with the priorities * configured during switch setup, which we read from hardware here. */ static int dsa_user_dcbnl_init(struct net_device *dev) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; int port = dp->index; int err; if (ds->ops->port_get_default_prio) { int prio = ds->ops->port_get_default_prio(ds, port); struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE, .protocol = 0, .priority = prio, }; if (prio < 0) return prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } if (ds->ops->port_get_dscp_prio) { int protocol; for (protocol = 0; protocol < 64; protocol++) { struct dcb_app app = { .selector = IEEE_8021QAZ_APP_SEL_DSCP, .protocol = protocol, }; int prio; prio = ds->ops->port_get_dscp_prio(ds, port, protocol); if (prio == -EOPNOTSUPP) continue; if (prio < 0) return prio; app.priority = prio; err = dcb_ieee_setapp(dev, &app); if (err) return err; } } return 0; } static const struct ethtool_ops dsa_user_ethtool_ops = { .get_drvinfo = dsa_user_get_drvinfo, .get_regs_len = dsa_user_get_regs_len, .get_regs = dsa_user_get_regs, .nway_reset = dsa_user_nway_reset, .get_link = ethtool_op_get_link, .get_eeprom_len = dsa_user_get_eeprom_len, .get_eeprom = dsa_user_get_eeprom, .set_eeprom = dsa_user_set_eeprom, .get_strings = dsa_user_get_strings, .get_ethtool_stats = dsa_user_get_ethtool_stats, .get_sset_count = dsa_user_get_sset_count, .get_eth_phy_stats = dsa_user_get_eth_phy_stats, .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, .get_ts_stats = dsa_user_get_ts_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, .get_eee = dsa_user_get_eee, .get_link_ksettings = dsa_user_get_link_ksettings, .set_link_ksettings = dsa_user_set_link_ksettings, .get_pause_stats = dsa_user_get_pause_stats, .get_pauseparam = dsa_user_get_pauseparam, .set_pauseparam = dsa_user_set_pauseparam, .get_rxnfc = dsa_user_get_rxnfc, .set_rxnfc = dsa_user_set_rxnfc, .get_ts_info = dsa_user_get_ts_info, .self_test = dsa_user_net_selftest, .get_mm = dsa_user_get_mm, .set_mm = dsa_user_set_mm, .get_mm_stats = dsa_user_get_mm_stats, }; static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = { .ieee_setapp = dsa_user_dcbnl_ieee_setapp, .ieee_delapp = dsa_user_dcbnl_ieee_delapp, .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust, .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust, }; static void dsa_user_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (ds->ops->get_stats64) ds->ops->get_stats64(ds, dp->index, s); else dev_get_tstats64(dev, s); } static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_user_to_port(ctx->dev); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; ctx->dev = conduit; return 0; } static int dsa_user_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_get) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_get(ds, dp->index, cfg); } static int dsa_user_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; if (!ds->ops->port_hwtstamp_set) return -EOPNOTSUPP; return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); } static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, .ndo_start_xmit = dsa_user_xmit, .ndo_change_rx_flags = dsa_user_change_rx_flags, .ndo_set_rx_mode = dsa_user_set_rx_mode, .ndo_set_mac_address = dsa_user_set_mac_address, .ndo_fdb_dump = dsa_user_fdb_dump, .ndo_eth_ioctl = dsa_user_ioctl, .ndo_get_iflink = dsa_user_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_user_netpoll_setup, .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup, .ndo_poll_controller = dsa_user_poll_controller, #endif .ndo_setup_tc = dsa_user_setup_tc, .ndo_get_stats64 = dsa_user_get_stats64, .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, .ndo_hwtstamp_get = dsa_user_hwtstamp_get, .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { .name = "dsa", }; void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) { const struct dsa_port *dp = dsa_to_port(ds, port); if (dp->pl) phylink_mac_change(dp->pl, up); } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); static void dsa_user_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { struct dsa_port *dp = dsa_phylink_to_port(config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would * not be called if it was not. */ ds->ops->phylink_fixed_state(ds, dp->index, state); } /* user device setup *******************************************************/ static int dsa_user_phy_connect(struct net_device *user_dev, int addr, u32 flags) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_switch *ds = dp->ds; user_dev->phydev = mdiobus_get_phy(ds->user_mii_bus, addr); if (!user_dev->phydev) { netdev_err(user_dev, "no phy at %d\n", addr); return -ENODEV; } user_dev->phydev->dev_flags |= flags; return phylink_connect_phy(dp->pl, user_dev->phydev); } static int dsa_user_phy_setup(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); struct device_node *port_dn = dp->dn; struct dsa_switch *ds = dp->ds; u32 phy_flags = 0; int ret; dp->pl_config.dev = &user_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; /* The get_fixed_state callback takes precedence over polling the * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set * this if the switch provides such a callback. */ if (ds->ops->phylink_fixed_state) { dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state; dp->pl_config.poll_fixed_state = true; } ret = dsa_port_phylink_create(dp); if (ret) return ret; if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags); if (ret == -ENODEV && ds->user_mii_bus) { /* We could not connect to a designated PHY or SFP, so try to * use the switch internal MDIO bus instead */ ret = dsa_user_phy_connect(user_dev, dp->index, phy_flags); } if (ret) { netdev_err(user_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); dsa_port_phylink_destroy(dp); } return ret; } void dsa_user_setup_tagger(struct net_device *user) { struct dsa_port *dp = dsa_user_to_port(user); struct net_device *conduit = dsa_port_to_conduit(dp); struct dsa_user_priv *p = netdev_priv(user); const struct dsa_port *cpu_dp = dp->cpu_dp; const struct dsa_switch *ds = dp->ds; user->needed_headroom = cpu_dp->tag_ops->needed_headroom; user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom; /* Try to save one extra realloc later in the TX path (in the conduit) * by also inheriting the conduit's needed headroom and tailroom. * The 8021q driver also does this. */ user->needed_headroom += conduit->needed_headroom; user->needed_tailroom += conduit->needed_tailroom; p->xmit = cpu_dp->tag_ops->xmit; user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_detach(user_dev); rtnl_lock(); phylink_stop(dp->pl); rtnl_unlock(); return 0; } int dsa_user_resume(struct net_device *user_dev) { struct dsa_port *dp = dsa_user_to_port(user_dev); if (!netif_running(user_dev)) return 0; netif_device_attach(user_dev); rtnl_lock(); phylink_start(dp->pl); rtnl_unlock(); return 0; } int dsa_user_create(struct dsa_port *port) { struct net_device *conduit = dsa_port_to_conduit(port); struct dsa_switch *ds = port->ds; struct net_device *user_dev; struct dsa_user_priv *p; const char *name; int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; if (port->name) { name = port->name; assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_ENUM; } user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name, assign_type, ether_setup, ds->num_tx_queues, 1); if (user_dev == NULL) return -ENOMEM; user_dev->rtnl_link_ops = &dsa_link_ops; user_dev->ethtool_ops = &dsa_user_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) user_dev->dcbnl_ops = &dsa_user_dcbnl_ops; #endif if (!is_zero_ether_addr(port->mac)) eth_hw_addr_set(user_dev, port->mac); else eth_hw_addr_inherit(user_dev, conduit); user_dev->priv_flags |= IFF_NO_QUEUE; if (dsa_switch_supports_uc_filtering(ds)) user_dev->priv_flags |= IFF_UNICAST_FLT; user_dev->netdev_ops = &dsa_user_netdev_ops; if (ds->ops->port_max_mtu) user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index); SET_NETDEV_DEVTYPE(user_dev, &dsa_type); SET_NETDEV_DEV(user_dev, port->ds->dev); SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port); user_dev->dev.of_node = port->dn; user_dev->vlan_features = conduit->vlan_features; p = netdev_priv(user_dev); user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; ret = gro_cells_init(&p->gcells, user_dev); if (ret) goto out_free; p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); port->user = user_dev; dsa_user_setup_tagger(user_dev); netif_carrier_off(user_dev); ret = dsa_user_phy_setup(user_dev); if (ret) { netdev_err(user_dev, "error %d setting up PHY for tree %d, switch %d, port %d\n", ret, ds->dst->index, ds->index, port->index); goto out_gcells; } rtnl_lock(); ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN); if (ret && ret != -EOPNOTSUPP) dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n", ret, ETH_DATA_LEN, port->index); ret = register_netdevice(user_dev); if (ret) { netdev_err(conduit, "error %d registering interface %s\n", ret, user_dev->name); rtnl_unlock(); goto out_phy; } if (IS_ENABLED(CONFIG_DCB)) { ret = dsa_user_dcbnl_init(user_dev); if (ret) { netdev_err(user_dev, "failed to initialize DCB: %pe\n", ERR_PTR(ret)); rtnl_unlock(); goto out_unregister; } } ret = netdev_upper_dev_link(conduit, user_dev, NULL); rtnl_unlock(); if (ret) goto out_unregister; return 0; out_unregister: unregister_netdev(user_dev); out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: free_netdev(user_dev); port->user = NULL; return ret; } void dsa_user_destroy(struct net_device *user_dev) { struct net_device *conduit = dsa_user_to_conduit(user_dev); struct dsa_port *dp = dsa_user_to_port(user_dev); struct dsa_user_priv *p = netdev_priv(user_dev); netif_carrier_off(user_dev); rtnl_lock(); netdev_upper_dev_unlink(conduit, user_dev); unregister_netdevice(user_dev); phylink_disconnect_phy(dp->pl); rtnl_unlock(); dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_netdev(user_dev); } int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit, struct netlink_ext_ack *extack) { struct net_device *old_conduit = dsa_user_to_conduit(dev); struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; struct net_device *upper; struct list_head *iter; int err; if (conduit == old_conduit) return 0; if (!ds->ops->port_change_conduit) { NL_SET_ERR_MSG_MOD(extack, "Driver does not support changing DSA conduit"); return -EOPNOTSUPP; } if (!netdev_uses_dsa(conduit)) { NL_SET_ERR_MSG_MOD(extack, "Interface not eligible as DSA conduit"); return -EOPNOTSUPP; } netdev_for_each_upper_dev_rcu(conduit, upper, iter) { if (dsa_user_dev_check(upper)) continue; if (netif_is_bridge_master(upper)) continue; NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers"); return -EOPNOTSUPP; } /* Since we allow live-changing the DSA conduit, plus we auto-open the * DSA conduit when the user port opens => we need to ensure that the * new DSA conduit is open too. */ if (dev->flags & IFF_UP) { err = dev_open(conduit, extack); if (err) return err; } netdev_upper_dev_unlink(old_conduit, dev); err = netdev_upper_dev_link(conduit, dev, extack); if (err) goto out_revert_old_conduit_unlink; err = dsa_port_change_conduit(dp, conduit, extack); if (err) goto out_revert_conduit_link; /* Update the MTU of the new CPU port through cross-chip notifiers */ err = dsa_user_change_mtu(dev, dev->mtu); if (err && err != -EOPNOTSUPP) { netdev_warn(dev, "nonfatal error updating MTU with new conduit: %pe\n", ERR_PTR(err)); } return 0; out_revert_conduit_link: netdev_upper_dev_unlink(conduit, dev); out_revert_old_conduit_unlink: netdev_upper_dev_link(old_conduit, dev, NULL); return err; } bool dsa_user_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_user_netdev_ops; } EXPORT_SYMBOL_GPL(dsa_user_dev_check); static int dsa_user_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return err; dp = dsa_user_to_port(dev); extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev)) { if (info->linking) { err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_port_lag_join(dp, info->upper_dev, info->upper_info, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_lag_leave(dp, info->upper_dev); err = NOTIFY_OK; } } else if (is_hsr_master(info->upper_dev)) { if (info->linking) { err = dsa_port_hsr_join(dp, info->upper_dev, extack); if (err == -EOPNOTSUPP) { NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported"); err = 0; } err = notifier_from_errno(err); } else { dsa_port_hsr_leave(dp, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp; if (!dsa_user_dev_check(dev)) return NOTIFY_DONE; dp = dsa_user_to_port(dev); if (netif_is_bridge_master(info->upper_dev) && !info->linking) dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices cannot * meaningfully placed under a bridge yet */ return NOTIFY_DONE; } static int dsa_user_lag_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_changeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } /* Same as dsa_user_lag_changeupper() except that it calls * dsa_user_prechangeupper() */ static int dsa_user_lag_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct net_device *lower; struct list_head *iter; int err = NOTIFY_DONE; struct dsa_port *dp; if (!netif_is_lag_master(dev)) return err; netdev_for_each_lower_dev(dev, lower, iter) { if (!dsa_user_dev_check(lower)) continue; dp = dsa_user_to_port(lower); if (!dp->lag) /* Software LAG */ continue; err = dsa_user_prechangeupper(lower, info); if (notifier_to_errno(err)) break; } return err; } static int dsa_prevent_bridging_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *ext_ack; struct net_device *user, *br; struct dsa_port *dp; ext_ack = netdev_notifier_info_to_extack(&info->info); if (!is_vlan_dev(dev)) return NOTIFY_DONE; user = vlan_dev_real_dev(dev); if (!dsa_user_dev_check(user)) return NOTIFY_DONE; dp = dsa_user_to_port(user); br = dsa_port_bridge_dev_get(dp); if (!br) return NOTIFY_DONE; /* Deny enslaving a VLAN device into a VLAN-aware bridge */ if (br_vlan_enabled(br) && netif_is_bridge_master(info->upper_dev) && info->linking) { NL_SET_ERR_MSG_MOD(ext_ack, "Cannot make VLAN device join VLAN-aware bridge"); return notifier_from_errno(-EINVAL); } return NOTIFY_DONE; } static int dsa_user_check_8021q_upper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_user_to_port(dev); struct net_device *br = dsa_port_bridge_dev_get(dp); struct bridge_vlan_info br_info; struct netlink_ext_ack *extack; int err = NOTIFY_DONE; u16 vid; if (!br || !br_vlan_enabled(br)) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); vid = vlan_dev_vlan_id(info->upper_dev); /* br_vlan_get_info() returns -EINVAL or -ENOENT if the * device, respectively the VID is not found, returning * 0 means success, which is a failure for us here. */ err = br_vlan_get_info(br, vid, &br_info); if (err == 0) { NL_SET_ERR_MSG_MOD(extack, "This VLAN is already configured by the bridge"); return notifier_from_errno(-EBUSY); } return NOTIFY_DONE; } static int dsa_user_prechangeupper_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_switch *ds; struct dsa_port *dp; int err; if (!dsa_user_dev_check(dev)) return dsa_prevent_bridging_8021q_upper(dev, info); dp = dsa_user_to_port(dev); ds = dp->ds; if (ds->ops->port_prechangeupper) { err = ds->ops->port_prechangeupper(ds, dp->index, info); if (err) return notifier_from_errno(err); } if (is_vlan_dev(info->upper_dev)) return dsa_user_check_8021q_upper(dev, info); return NOTIFY_DONE; } /* To be eligible as a DSA conduit, a LAG must have all lower interfaces be * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of * switches in the same switch tree. */ static int dsa_lag_conduit_validate(struct net_device *lag_dev, struct netlink_ext_ack *extack) { struct net_device *lower1, *lower2; struct list_head *iter1, *iter2; netdev_for_each_lower_dev(lag_dev, lower1, iter1) { netdev_for_each_lower_dev(lag_dev, lower2, iter2) { if (!netdev_uses_dsa(lower1) || !netdev_uses_dsa(lower2)) { NL_SET_ERR_MSG_MOD(extack, "All LAG ports must be eligible as DSA conduits"); return notifier_from_errno(-EINVAL); } if (lower1 == lower2) continue; if (!dsa_port_tree_same(lower1->dsa_ptr, lower2->dsa_ptr)) { NL_SET_ERR_MSG_MOD(extack, "LAG contains DSA conduits of disjoint switch trees"); return notifier_from_errno(-EINVAL); } } } return NOTIFY_DONE; } static int dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(conduit)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; /* Allow DSA switch uppers */ if (dsa_user_dev_check(info->upper_dev)) return NOTIFY_DONE; /* Allow bridge uppers of DSA conduits, subject to further * restrictions in dsa_bridge_prechangelower_sanity_check() */ if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; /* Allow LAG uppers, subject to further restrictions in * dsa_lag_conduit_prechangelower_sanity_check() */ if (netif_is_lag_master(info->upper_dev)) return dsa_lag_conduit_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA conduit cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } static int dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); struct net_device *lag_dev = info->upper_dev; struct net_device *lower; struct list_head *iter; if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; if (!netdev_uses_dsa(dev)) { NL_SET_ERR_MSG(extack, "Only DSA conduits can join a LAG DSA conduit"); return notifier_from_errno(-EINVAL); } netdev_for_each_lower_dev(lag_dev, lower, iter) { if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { NL_SET_ERR_MSG(extack, "Interface is DSA conduit for a different switch tree than this LAG"); return notifier_from_errno(-EINVAL); } break; } return NOTIFY_DONE; } /* Don't allow bridging of DSA conduits, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge * layer just returns RX_HANDLER_CONSUMED, stopping RX processing for these * frames). * The only case where that would not be an issue is when bridging can already * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev * port, and is bridged only with other ports from the same hardware device. */ static int dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, struct netdev_notifier_changeupper_info *info) { struct net_device *br = info->upper_dev; struct netlink_ext_ack *extack; struct net_device *lower; struct list_head *iter; if (!netif_is_bridge_master(br)) return NOTIFY_DONE; if (!info->linking) return NOTIFY_DONE; extack = netdev_notifier_info_to_extack(&info->info); netdev_for_each_lower_dev(br, lower, iter) { if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower)) continue; if (!netdev_port_same_parent_id(lower, new_lower)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA conduit"); return notifier_from_errno(-EINVAL); } } return NOTIFY_DONE; } static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst, struct net_device *lag_dev) { struct net_device *new_conduit = dsa_tree_find_first_conduit(dst); struct dsa_port *dp; int err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, new_conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", new_conduit->name, ERR_PTR(err)); } } } static int dsa_conduit_lag_join(struct net_device *conduit, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack) { struct dsa_port *cpu_dp = conduit->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *dp; int err; err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack); if (err) return err; dsa_tree_for_each_user_port(dp, dst) { if (dsa_port_to_conduit(dp) != conduit) continue; err = dsa_user_change_conduit(dp->user, lag_dev, extack); if (err) goto restore; } return 0; restore: dsa_tree_for_each_user_port_continue_reverse(dp, dst) { if (dsa_port_to_conduit(dp) != lag_dev) continue; err = dsa_user_change_conduit(dp->user, conduit, NULL); if (err) { netdev_err(dp->user, "failed to restore conduit to %s: %pe\n", conduit->name, ERR_PTR(err)); } } dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); return err; } static void dsa_conduit_lag_leave(struct net_device *conduit, struct net_device *lag_dev) { struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_port *new_cpu_dp = NULL; struct net_device *lower; struct list_head *iter; netdev_for_each_lower_dev(lag_dev, lower, iter) { if (netdev_uses_dsa(lower)) { new_cpu_dp = lower->dsa_ptr; break; } } if (new_cpu_dp) { /* Update the CPU port of the user ports still under the LAG * so that dsa_port_to_conduit() continues to work properly */ dsa_tree_for_each_user_port(dp, dst) if (dsa_port_to_conduit(dp) == lag_dev) dp->cpu_dp = new_cpu_dp; /* Update the index of the virtual CPU port to match the lowest * physical CPU port */ lag_dev->dsa_ptr = new_cpu_dp; wmb(); } else { /* If the LAG DSA conduit has no ports left, migrate back all * user ports to the first physical CPU port */ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev); } /* This DSA conduit has left its LAG in any case, so let * the CPU port leave the hardware LAG as well */ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr); } static int dsa_conduit_changeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct netlink_ext_ack *extack; int err = NOTIFY_DONE; if (!netdev_uses_dsa(dev)) return err; extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_lag_master(info->upper_dev)) { if (info->linking) { err = dsa_conduit_lag_join(dev, info->upper_dev, info->upper_info, extack); err = notifier_from_errno(err); } else { dsa_conduit_lag_leave(dev, info->upper_dev); err = NOTIFY_OK; } } return err; } static int dsa_user_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_PRECHANGEUPPER: { struct netdev_notifier_changeupper_info *info = ptr; int err; err = dsa_user_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_conduit_prechangeupper_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_lag_conduit_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; err = dsa_user_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_prechangeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGEUPPER: { int err; err = dsa_user_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_user_lag_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; err = dsa_conduit_changeupper(dev, ptr); if (notifier_to_errno(err)) return err; break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; int err = 0; if (dsa_user_dev_check(dev)) { dp = dsa_user_to_port(dev); err = dsa_port_lag_change(dp, info->lower_state_info); } /* Mirror LAG port events on DSA conduits that are in * a LAG towards their respective switch CPU ports */ if (netdev_uses_dsa(dev)) { dp = dev->dsa_ptr; err = dsa_port_lag_change(dp, info->lower_state_info); } return notifier_from_errno(err); } case NETDEV_CHANGE: case NETDEV_UP: { /* Track state of conduit port. * DSA driver may require the conduit port (and indirectly * the tagger) to be available for some special operation. */ if (netdev_uses_dsa(dev)) { struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->ds->dst; /* Track when the conduit port is UP */ dsa_tree_conduit_oper_state_change(dst, dev, netif_oper_up(dev)); /* Track when the conduit port is ready and can accept * packet. * NETDEV_UP event is not enough to flag a port as ready. * We also have to wait for linkwatch_do_dev to dev_activate * and emit a NETDEV_CHANGE event. * We check if a conduit port is ready by checking if the dev * have a qdisc assigned and is not noop. */ dsa_tree_conduit_admin_state_change(dst, dev, !qdisc_tx_is_noop(dev)); return NOTIFY_OK; } return NOTIFY_DONE; } case NETDEV_GOING_DOWN: { struct dsa_port *dp, *cpu_dp; struct dsa_switch_tree *dst; LIST_HEAD(close_list); if (!netdev_uses_dsa(dev)) return NOTIFY_DONE; cpu_dp = dev->dsa_ptr; dst = cpu_dp->ds->dst; dsa_tree_conduit_admin_state_change(dst, dev, false); list_for_each_entry(dp, &dst->ports, list) { if (!dsa_port_is_user(dp)) continue; if (dp->cpu_dp != cpu_dp) continue; list_add(&dp->user->close_list, &close_list); } dev_close_many(&close_list, true); return NOTIFY_OK; } default: break; } return NOTIFY_DONE; } static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { struct switchdev_notifier_fdb_info info = {}; info.addr = switchdev_work->addr; info.vid = switchdev_work->vid; info.offloaded = true; call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, switchdev_work->orig_dev, &info.info, NULL); } static void dsa_user_switchdev_event_work(struct work_struct *work) { struct dsa_switchdev_event_work *switchdev_work = container_of(work, struct dsa_switchdev_event_work, work); const unsigned char *addr = switchdev_work->addr; struct net_device *dev = switchdev_work->dev; u16 vid = switchdev_work->vid; struct dsa_switch *ds; struct dsa_port *dp; int err; dp = dsa_user_to_port(dev); ds = dp->ds; switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_add(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_add(dp, addr, vid); else err = dsa_port_fdb_add(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to add %pM vid %d to fdb: %d\n", dp->index, addr, vid, err); break; } dsa_fdb_offload_notify(switchdev_work); break; case SWITCHDEV_FDB_DEL_TO_DEVICE: if (switchdev_work->host_addr) err = dsa_port_bridge_host_fdb_del(dp, addr, vid); else if (dp->lag) err = dsa_port_lag_fdb_del(dp, addr, vid); else err = dsa_port_fdb_del(dp, addr, vid); if (err) { dev_err(ds->dev, "port %d failed to delete %pM vid %d from fdb: %d\n", dp->index, addr, vid, err); } break; } kfree(switchdev_work); } static bool dsa_foreign_dev_check(const struct net_device *dev, const struct net_device *foreign_dev) { const struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch_tree *dst = dp->ds->dst; if (netif_is_bridge_master(foreign_dev)) return !dsa_tree_offloads_bridge_dev(dst, foreign_dev); if (netif_is_bridge_port(foreign_dev)) return !dsa_tree_offloads_bridge_port(dst, foreign_dev); /* Everything else is foreign */ return true; } static int dsa_user_fdb_event(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, const struct switchdev_notifier_fdb_info *fdb_info) { struct dsa_switchdev_event_work *switchdev_work; struct dsa_port *dp = dsa_user_to_port(dev); bool host_addr = fdb_info->is_local; struct dsa_switch *ds = dp->ds; if (ctx && ctx != dp) return 0; if (!dp->bridge) return 0; if (switchdev_fdb_is_dynamically_learned(fdb_info)) { if (dsa_port_offloads_bridge_port(dp, orig_dev)) return 0; /* FDB entries learned by the software bridge or by foreign * bridge ports should be installed as host addresses only if * the driver requests assisted learning. */ if (!ds->assisted_learning_on_cpu_port) return 0; } /* Also treat FDB entries on foreign interfaces bridged with us as host * addresses. */ if (dsa_foreign_dev_check(dev, orig_dev)) host_addr = true; /* Check early that we're not doing work in vain. * Host addresses on LAG ports still require regular FDB ops, * since the CPU port isn't in a LAG. */ if (dp->lag && !host_addr) { if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del) return -EOPNOTSUPP; } else { if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) return -EOPNOTSUPP; } switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); if (!switchdev_work) return -ENOMEM; netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", orig_dev->name, fdb_info->addr, fdb_info->vid, host_addr ? " as host address" : ""); INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work); switchdev_work->event = event; switchdev_work->dev = dev; switchdev_work->orig_dev = orig_dev; ether_addr_copy(switchdev_work->addr, fdb_info->addr); switchdev_work->vid = fdb_info->vid; switchdev_work->host_addr = host_addr; dsa_schedule_work(&switchdev_work->work); return 0; } /* Called under rcu_read_lock() */ static int dsa_user_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: err = switchdev_handle_fdb_event_to_device(dev, event, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_fdb_event); return notifier_from_errno(err); default: return NOTIFY_DONE; } return NOTIFY_OK; } static int dsa_user_switchdev_blocking_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); int err; switch (event) { case SWITCHDEV_PORT_OBJ_ADD: err = switchdev_handle_port_obj_add_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_add); return notifier_from_errno(err); case SWITCHDEV_PORT_OBJ_DEL: err = switchdev_handle_port_obj_del_foreign(dev, ptr, dsa_user_dev_check, dsa_foreign_dev_check, dsa_user_port_obj_del); return notifier_from_errno(err); case SWITCHDEV_PORT_ATTR_SET: err = switchdev_handle_port_attr_set(dev, ptr, dsa_user_dev_check, dsa_user_port_attr_set); return notifier_from_errno(err); } return NOTIFY_DONE; } static struct notifier_block dsa_user_nb __read_mostly = { .notifier_call = dsa_user_netdevice_event, }; struct notifier_block dsa_user_switchdev_notifier = { .notifier_call = dsa_user_switchdev_event, }; struct notifier_block dsa_user_switchdev_blocking_notifier = { .notifier_call = dsa_user_switchdev_blocking_event, }; int dsa_user_register_notifier(void) { struct notifier_block *nb; int err; err = register_netdevice_notifier(&dsa_user_nb); if (err) return err; err = register_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) goto err_switchdev_nb; nb = &dsa_user_switchdev_blocking_notifier; err = register_switchdev_blocking_notifier(nb); if (err) goto err_switchdev_blocking_nb; return 0; err_switchdev_blocking_nb: unregister_switchdev_notifier(&dsa_user_switchdev_notifier); err_switchdev_nb: unregister_netdevice_notifier(&dsa_user_nb); return err; } void dsa_user_unregister_notifier(void) { struct notifier_block *nb; int err; nb = &dsa_user_switchdev_blocking_notifier; err = unregister_switchdev_blocking_notifier(nb); if (err) pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err); err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier); if (err) pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err); err = unregister_netdevice_notifier(&dsa_user_nb); if (err) pr_err("DSA: failed to unregister user notifier (%d)\n", err); }
260 249 5 23 23 21 21 9 12 21 228 14 21 201 20 7 254 16 16 70 148 31 79 148 148 231 228 2 14 104 143 143 227 14 232 228 14 233 21 232 104 143 143 228 14 232 12 4 8 12 12 12 232 280 37 254 279 280 13 295 280 100 296 54 54 54 54 234 234 234 231 5 171 216 227 14 230 232 232 232 232 171 218 171 218 10 2 2 313 314 314 314 314 314 21 21 21 21 21 12 280 280 280 279 269 3 12 15 264 21 21 21 21 6 12 12 12 3 21 9 12 21 296 258 296 34 258 258 309 277 74 31 296 295 296 296 296 296 258 74 3 3 2 296 168 173 173 297 155 220 297 293 27 143 280 3 16 138 276 297 297 296 296 296 296 296 76 76 3 74 77 77 1 76 295 83 83 83 83 295 282 83 17 295 256 83 69 52 24 45 195 19 63 42 30 78 11 77 17 61 74 79 247 195 79 79 265 265 265 265 265 265 265 265 265 265 265 3 265 265 265 265 265 265 265 265 265 265 265 265 265 265 265 265 265 265 265 185 3 51 259 2 259 34 280 16 28 280 184 184 143 143 184 45 71 231 231 231 45 195 158 106 72 39 247 126 280 280 280 280 264 247 247 52 2 54 280 280 280 73 122 280 184 122 1 1 10 10 6 4 9 9 2 3 4 7 7 77 77 74 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 // SPDX-License-Identifier: GPL-2.0-or-later /* * VMA-specific functions. */ #include "vma_internal.h" #include "vma.h" struct mmap_state { struct mm_struct *mm; struct vma_iterator *vmi; unsigned long addr; unsigned long end; pgoff_t pgoff; unsigned long pglen; unsigned long flags; struct file *file; pgprot_t page_prot; /* User-defined fields, perhaps updated by .mmap_prepare(). */ const struct vm_operations_struct *vm_ops; void *vm_private_data; unsigned long charged; struct vm_area_struct *prev; struct vm_area_struct *next; /* Unmapping state. */ struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .flags = flags_, \ .file = file_, \ .page_prot = vm_get_page_prot(flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ struct vma_merge_struct name = { \ .mm = (map_)->mm, \ .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ .flags = (map_)->flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ .middle = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ } /* * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain * more than one anon_vma_chain connecting it to more than one anon_vma. A merge * would mean a wider range of folios sharing the root anon_vma lock, and thus * potential lock contention, we do not wish to encourage merging such that this * scales to a problem. */ static bool vma_had_uncowed_parents(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by anon_vma lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev; struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */ struct anon_vma *tgt_anon = tgt->anon_vma; struct anon_vma *src_anon = vmg->anon_vma; /* * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we * will remove the existing VMA's anon_vma's so there's no scalability * concerns. */ VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ if (!tgt_anon && src_anon) return !vma_had_uncowed_parents(src); /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) return !vma_had_uncowed_parents(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @vmg: The merge state that will be used to determine adjustment and VMA * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; struct vm_area_struct **remove = &vp->remove; memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; if (vmg && vmg->__remove_middle) { *remove = vmg->middle; remove = &vp->remove2; } if (vmg && vmg->__remove_next) *remove = vmg->next; if (vmg && vmg->__adjust_middle_start) adjust = vmg->middle; else if (vmg && vmg->__adjust_next_start) adjust = vmg->next; else adjust = NULL; vp->adj_next = adjust; if (!vp->anon_vma && adjust) vp->anon_vma = adjust->anon_vma; VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; if (vmg && vmg->skip_vma_uprobe) vp->skip_vma_uprobe = true; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg, /* merge_next = */ true)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg, /* merge_next = */ false)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store_new(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); if (!vp->skip_vma_uprobe) { uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } } if (vp->remove) { again: vma_mark_detached(vp->remove); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { struct vm_area_struct *next = vmg->next; struct vm_area_struct *prev; if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. * * We therefore check this in addition to mergeability to either side. */ prev = vmg->prev; return !prev->anon_vma || !next->anon_vma || prev->anon_vma == next->anon_vma; } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, /* mm_wr_locked = */ true); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, /* mm_wr_locked = */ true); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); /* * Get rid of huge pages and shared page tables straddling the split * boundary. */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (is_vm_hugetlb_page(vma)) hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the * instance that the destination VMA has no anon_vma but the source does. * * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * There are three cases to consider for correctly propagating * anon_vma's on merge. * * The first is trivial - neither VMA has anon_vma, we need not do * anything. * * The second where both have anon_vma is also a no-op, as they must * then be the same, so there is simply nothing to copy. * * Here we cover the third - if the destination VMA has no anon_vma, * that is it is unfaulted, we need to ensure that the newly merged * range is referenced by the anon_vma's of the source. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * Based on the vmg flag indicating whether we need to adjust the vm_start field * for the middle or next VMA, we calculate what the range of the newly adjusted * VMA ought to be, and set the VMA's range accordingly. */ static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { struct vm_area_struct *adjust; pgoff_t pgoff; if (vmg->__adjust_middle_start) { adjust = vmg->middle; pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { adjust = vmg->next; pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); } else { return; } vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); } /* * Actually perform the VMA merge operation. * * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not * modify any VMAs or cause inconsistent state should an OOM condition arise. * * Returns 0 on success, or an error value on failure. */ static int commit_merge(struct vma_merge_struct *vmg) { struct vm_area_struct *vma; struct vma_prepare vp; if (vmg->__adjust_next_start) { /* We manipulate middle and adjust next, which is the target. */ vma = vmg->middle; vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } init_multi_vma_prep(&vp, vma, vmg); /* * If vmg->give_up_on_oom is set, we're safe, because we don't actually * manipulate any VMAs until we succeed at preallocation. * * Past this point, we will not return an error. */ if (vma_iter_prealloc(vmg->vmi, vma)) return -ENOMEM; vma_prepare(&vp); /* * THP pages may need to do additional splits if we increase * middle->vm_start. */ vma_adjust_trans_huge(vma, vmg->start, vmg->end, vmg->__adjust_middle_start ? vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); vmg_adjust_set_range(vmg); vma_iter_store_overwrite(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->middle->vm_mm address space to account * for this, as well as any VMAs shrunk/expanded/deleted as a result of this * merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modifed to @vmg->middle. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = middle && start == middle->vm_start; bool right_side = middle && end == middle->vm_end; int err = 0; bool merge_left, merge_right, merge_both; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If middle == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(middle && ((middle != prev && vmg->start != middle->vm_start) || vmg->end > middle->vm_end), vmg); /* The vmi must be positioned within vmg->middle. */ VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ vmg->__remove_middle = left_side && right_side; /* * If we need to remove middle in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (vmg->__remove_middle && !can_merge_remove_vma(middle)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ vmg->__remove_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and middle (thereby deleting middle). */ if (vmg->__remove_next && !can_merge_remove_vma(next)) { vmg->__remove_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); if (merge_right) { vma_start_write(next); vmg->target = next; } if (merge_left) { vma_start_write(prev); vmg->target = prev; } if (merge_both) { /* * |<-------------------->| * |-------********-------| * prev middle next * extend delete delete */ vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or middle contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : middle, &anon_dup); } else if (merge_left) { /* * |<------------>| OR * |<----------------->| * |-------************* * prev middle * extend shrink/delete */ vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!vmg->__remove_middle) vmg->__adjust_middle_start = true; err = dup_anon_vma(prev, middle, &anon_dup); } else { /* merge_right */ /* * |<------------->| OR * |<----------------->| * *************-------| * middle next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be middle. */ VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* We shrink middle and expand next. */ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; } err = dup_anon_vma(next, middle, &anon_dup); } if (err || commit_merge(vmg)) goto abort; khugepaged_enter_vma(vmg->target, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); if (anon_dup) unlink_anon_vmas(anon_dup); /* * This means we have failed to clone anon_vma's correctly, but no * actual changes to VMAs have occurred, so no harm no foul - if the * user doesn't want this reported and instead just wants to give up on * the merge, allow it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->middle, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->middle = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; vmg->middle = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!vmg->just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->middle && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->middle, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->middle; } return NULL; } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->middle and vmg->end == * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock. * - The caller must have set @vmg->middle and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *next = vmg->next; mmap_assert_write_locked(vmg->mm); vma_start_write(middle); if (next && (middle != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); /* * In this case we don't report OOM, so vmg->give_up_on_mm is * safe. */ ret = dup_anon_vma(middle, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != middle && vmg->end > next->vm_start, vmg); /* Only handles expanding */ VM_WARN_ON_VMG(middle->vm_start < vmg->start || middle->vm_end > vmg->end, vmg); vmg->target = middle; if (remove_next) vmg->__remove_next = true; if (commit_merge(vmg)) goto nomem; return 0; nomem: if (anon_dup) unlink_anon_vmas(anon_dup); /* * If the user requests that we just give upon OOM, we are safe to do so * here, as commit merge provides this contract to us. Nothing has been * changed - no harm no foul, just don't report it. */ if (!vmg->give_up_on_oom) vmg->state = VMA_MERGE_ERROR_NOMEM; return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; if (!vms->clear_ptes) /* Nothing to do */ return; /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. */ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_attached(vma); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= sysctl_max_map_count) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (!can_modify_vma(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (!can_modify_vma(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? */ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (next->vm_flags & VM_LOCKED) vms->locked_vm += nrpages; if (next->vm_flags & VM_ACCOUNT) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->middle; unsigned long start = vmg->start; unsigned long end = vmg->end; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; if (vmg_nomem(vmg)) return ERR_PTR(-ENOMEM); /* * Split can fail for reasons other than OOM, so if the user requests * this it's probably a mistake. */ VM_WARN_ON(vmg->give_up_on_oom && (vma->vm_start != start || vma->vm_end != end)); /* Split any preceding portion of the VMA. */ if (vma->vm_start < start) { int err = split_vma(vmg->vmi, vma, start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ if (vma->vm_end > end) { int err = split_vma(vmg->vmi, vma, end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } /* * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } void vma_link_file(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } } int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store_new(&vmi, vma); vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } /* * If the VMA we are copying might contain a uprobe PTE, ensure * that we do not establish one upon merge. Otherwise, when mremap() * moves page tables, it will orphan the newly created PTE. */ if (vma->vm_file) vmg.skip_vma_uprobe = true; new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.middle = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_new_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: fixup_hugetlb_reservations(new_vma); vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } /* * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * @map: Mapping state. * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map->file, map->flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; map->flags |= VM_ACCOUNT; } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ vms_clean_up_area(vms, &map->mas_detach); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = get_file(map->file); if (!map->file->f_op->mmap) return 0; error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&vmi->mas, vma, map->prev, map->next); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(map->flags != vma->vm_flags && !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); map->flags = vma->vm_flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (map->flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store_new(vmi, vma); map->mm->map_count++; vma_link_file(vma); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; unsigned long vm_flags = vma->vm_flags; perf_event_mmap(vma); /* Unmap any existing mapping in the area. */ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } /* * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that * specifies it. * * This is called prior to any merge attempt, and updates whitelisted fields * that are permitted to be updated by the caller. * * All but user-defined fields will be pre-populated with original values. * * Returns 0 on success, or an error code otherwise. */ static int call_mmap_prepare(struct mmap_state *map) { int err; struct vm_area_desc desc = { .mm = map->mm, .start = map->addr, .end = map->end, .pgoff = map->pgoff, .file = map->file, .vm_flags = map->flags, .page_prot = map->page_prot, }; /* Invoke the hook. */ err = __call_mmap_prepare(map->file, &desc); if (err) return err; /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; map->file = desc.file; map->flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ map->vm_ops = desc.vm_ops; map->vm_private_data = desc.private_data; return 0; } static void set_vma_user_defined_fields(struct vm_area_struct *vma, struct mmap_state *map) { if (map->vm_ops) vma->vm_ops = map->vm_ops; vma->vm_private_data = map->vm_private_data; } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; bool have_mmap_prepare = file && file->f_op->mmap_prepare; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); error = __mmap_prepare(&map, uf); if (!error && have_mmap_prepare) error = call_mmap_prepare(&map); if (error) goto abort_munmap; /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; } if (have_mmap_prepare) set_vma_user_defined_fields(vma, &map); __mmap_complete(&map, vma); return addr; /* Accounting was done by __mmap_prepare(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.just_expand = true; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the * later alignment. The first step is the minimum needed to * fulill the start gap, the next steps is the minimum to align * that. It is the minimum needed to fulill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ vma_iter_store_overwrite(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; } /* Insert vm structure into process list sorted by address * and into the inode's i_mmap tree. If vm_file is non-NULL * then i_mmap_rwsem is taken here. */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long charged = vma_pages(vma); if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index * are set. But now set the vm_pgoff it will almost certainly * end up with (unless mremap moves it elsewhere before that * first wfault), so /proc/pid/maps tells a consistent story. * * By setting it to reflect the virtual start address of the * vma, merges and splits can happen in a seamless way, just * using the existing file pgoff checks and manipulations. * Similarly in do_mmap and in do_brk_flags. */ if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } if (vma_link(mm, vma)) { if (vma->vm_flags & VM_ACCOUNT) vm_unacct_memory(charged); return -ENOMEM; } return 0; }
434 335 419 24 25 434 420 25 1542 1543 1 406 1546 410 1508 433 54 1540 7 23 1541 1 1 1546 1538 6 16 1541 1549 29 1545 1545 24 24 1559 1552 24 1560 1557 1544 1333 1562 1564 1 1 1 1 7 8 7 1 1555 1560 1564 1559 1555 54 54 1557 1554 1561 1538 1559 1538 1554 1563 1542 1541 409 1539 24 24 24 1560 8 8 1541 1542 25 25 25 1515 314 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/vsprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */ /* * Wirzenius wrote this portably, Torvalds fucked it up :-) */ /* * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com> * - changed to provide snprintf and vsnprintf functions * So Feb 1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de> * - scnprintf and vscnprintf */ #include <linux/stdarg.h> #include <linux/build_bug.h> #include <linux/clk.h> #include <linux/clk-provider.h> #include <linux/errname.h> #include <linux/module.h> /* for KSYM_SYMBOL_LEN */ #include <linux/types.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/kernel.h> #include <linux/kallsyms.h> #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> #include <linux/cred.h> #include <linux/rtc.h> #include <linux/sprintf.h> #include <linux/time.h> #include <linux/uuid.h> #include <linux/of.h> #include <net/addrconf.h> #include <linux/siphash.h> #include <linux/compiler.h> #include <linux/property.h> #include <linux/notifier.h> #ifdef CONFIG_BLOCK #include <linux/blkdev.h> #endif #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include <linux/unaligned.h> #include <linux/string_helpers.h> #include "kstrtox.h" /* Disable pointer hashing if requested */ bool no_hash_pointers __ro_after_init; EXPORT_SYMBOL_GPL(no_hash_pointers); noinline static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars) { const char *cp; unsigned long long result = 0ULL; size_t prefix_chars; unsigned int rv; cp = _parse_integer_fixup_radix(startp, &base); prefix_chars = cp - startp; if (prefix_chars < max_chars) { rv = _parse_integer_limit(cp, base, &result, max_chars - prefix_chars); /* FIXME */ cp += (rv & ~KSTRTOX_OVERFLOW); } else { /* Field too short for prefix + digit, skip over without converting */ cp = startp + max_chars; } if (endp) *endp = (char *)cp; return result; } /** * simple_strtoull - convert a string to an unsigned long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoull instead. */ noinline unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base) { return simple_strntoull(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoull); /** * simple_strtoul - convert a string to an unsigned long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoul instead. */ unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base) { return simple_strtoull(cp, endp, base); } EXPORT_SYMBOL(simple_strtoul); unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base, size_t max_chars) { return simple_strntoull(cp, endp, base, max_chars); } EXPORT_SYMBOL(simple_strntoul); /** * simple_strtol - convert a string to a signed long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtol instead. */ long simple_strtol(const char *cp, char **endp, unsigned int base) { if (*cp == '-') return -simple_strtoul(cp + 1, endp, base); return simple_strtoul(cp, endp, base); } EXPORT_SYMBOL(simple_strtol); noinline static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars) { /* * simple_strntoull() safely handles receiving max_chars==0 in the * case cp[0] == '-' && max_chars == 1. * If max_chars == 0 we can drop through and pass it to simple_strntoull() * and the content of *cp is irrelevant. */ if (*cp == '-' && max_chars > 0) return -simple_strntoull(cp + 1, endp, base, max_chars - 1); return simple_strntoull(cp, endp, base, max_chars); } /** * simple_strtoll - convert a string to a signed long long * @cp: The start of the string * @endp: A pointer to the end of the parsed string will be placed here * @base: The number base to use * * This function has caveats. Please use kstrtoll instead. */ long long simple_strtoll(const char *cp, char **endp, unsigned int base) { return simple_strntoll(cp, endp, base, INT_MAX); } EXPORT_SYMBOL(simple_strtoll); static inline int skip_atoi(const char **s) { int i = 0; do { i = i*10 + *((*s)++) - '0'; } while (isdigit(**s)); return i; } /* * Decimal conversion is by far the most typical, and is used for * /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed by emitting * two characters at a time, using a 200 byte lookup table. This * roughly halves the number of multiplications compared to computing * the digits one at a time. Implementation strongly inspired by the * previous version, which in turn used ideas described at * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission * from the author, Douglas W. Jones). * * It turns out there is precisely one 26 bit fixed-point * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual * range happens to be somewhat larger (x <= 1073741898), but that's * irrelevant for our purpose. * * For dividing a number in the range [10^4, 10^6-1] by 100, we still * need a 32x32->64 bit multiply, so we simply use the same constant. * * For dividing a number in the range [100, 10^4-1] by 100, there are * several options. The simplest is (x * 0x147b) >> 19, which is valid * for all x <= 43698. */ static const u16 decpair[100] = { #define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030) _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9), _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19), _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29), _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39), _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49), _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59), _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69), _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79), _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89), _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99), #undef _ }; /* * This will print a single '0' even if r == 0, since we would * immediately jump to out_r where two 0s would be written but only * one of them accounted for in buf. This is needed by ip4_string * below. All other callers pass a non-zero value of r. */ static noinline_for_stack char *put_dec_trunc8(char *buf, unsigned r) { unsigned q; /* 1 <= r < 10^8 */ if (r < 100) goto out_r; /* 100 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 1 <= q < 10^6 */ if (q < 100) goto out_q; /* 100 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 1 <= r < 10^4 */ if (r < 100) goto out_r; /* 100 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; out_q: /* 1 <= q < 100 */ r = q; out_r: /* 1 <= r < 100 */ *((u16 *)buf) = decpair[r]; buf += r < 10 ? 1 : 2; return buf; } #if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64 static noinline_for_stack char *put_dec_full8(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^8 */ q = (r * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 10^6 */ r = (q * (u64)0x28f5c29) >> 32; *((u16 *)buf) = decpair[q - 100*r]; buf += 2; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; buf += 2; return buf; } static noinline_for_stack char *put_dec(char *buf, unsigned long long n) { if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n <= 1.6e11 */ if (n >= 100*1000*1000) buf = put_dec_full8(buf, do_div(n, 100*1000*1000)); /* 1 <= n < 1e8 */ return put_dec_trunc8(buf, n); } #elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64 static void put_dec_full4(char *buf, unsigned r) { unsigned q; /* 0 <= r < 10^4 */ q = (r * 0x147b) >> 19; *((u16 *)buf) = decpair[r - 100*q]; buf += 2; /* 0 <= q < 100 */ *((u16 *)buf) = decpair[q]; } /* * Call put_dec_full4 on x % 10000, return x / 10000. * The approximation x/10000 == (x * 0x346DC5D7) >> 43 * holds for all x < 1,128,869,999. The largest value this * helper will ever be asked to convert is 1,125,520,955. * (second call in the put_dec code, assuming n is all-ones). */ static noinline_for_stack unsigned put_dec_helper4(char *buf, unsigned x) { uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43; put_dec_full4(buf, x - q * 10000); return q; } /* Based on code by Douglas W. Jones found at * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour> * (with permission from the author). * Performs no 64-bit division and hence should be fast on 32-bit machines. */ static char *put_dec(char *buf, unsigned long long n) { uint32_t d3, d2, d1, q, h; if (n < 100*1000*1000) return put_dec_trunc8(buf, n); d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ h = (n >> 32); d2 = (h ) & 0xffff; d3 = (h >> 16); /* implicit "& 0xffff" */ /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0 = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */ q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); q = put_dec_helper4(buf, q); q += 7671 * d3 + 9496 * d2 + 6 * d1; q = put_dec_helper4(buf+4, q); q += 4749 * d3 + 42 * d2; q = put_dec_helper4(buf+8, q); q += 281 * d3; buf += 12; if (q) buf = put_dec_trunc8(buf, q); else while (buf[-1] == '0') --buf; return buf; } #endif /* * Convert passed number to decimal string. * Returns the length of string. On buffer overflow, returns 0. * * If speed is not important, use snprintf(). It's easy to read the code. */ int num_to_str(char *buf, int size, unsigned long long num, unsigned int width) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[sizeof(num) * 3] __aligned(2); int idx, len; /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ if (num <= 9) { tmp[0] = '0' + num; len = 1; } else { len = put_dec(tmp, num) - tmp; } if (len > size || width > size) return 0; if (width > len) { width = width - len; for (idx = 0; idx < width; idx++) buf[idx] = ' '; } else { width = 0; } for (idx = 0; idx < len; ++idx) buf[idx + width] = tmp[len - idx - 1]; return len + width; } #define SIGN 1 /* unsigned/signed */ #define LEFT 2 /* left justified */ #define PLUS 4 /* show plus */ #define SPACE 8 /* space if plus */ #define ZEROPAD 16 /* pad with zero, must be 16 == '0' - ' ' */ #define SMALL 32 /* use lowercase in hex (must be 32 == 0x20) */ #define SPECIAL 64 /* prefix hex with "0x", octal with "0" */ static_assert(ZEROPAD == ('0' - ' ')); static_assert(SMALL == ('a' ^ 'A')); enum format_state { FORMAT_STATE_NONE, /* Just a string part */ FORMAT_STATE_NUM, FORMAT_STATE_WIDTH, FORMAT_STATE_PRECISION, FORMAT_STATE_CHAR, FORMAT_STATE_STR, FORMAT_STATE_PTR, FORMAT_STATE_PERCENT_CHAR, FORMAT_STATE_INVALID, }; struct printf_spec { unsigned char flags; /* flags to number() */ unsigned char base; /* number base, 8, 10 or 16 only */ short precision; /* # of digits/chars */ int field_width; /* width of output field */ } __packed; static_assert(sizeof(struct printf_spec) == 8); #define FIELD_WIDTH_MAX ((1 << 23) - 1) #define PRECISION_MAX ((1 << 15) - 1) static noinline_for_stack char *number(char *buf, char *end, unsigned long long num, struct printf_spec spec) { /* put_dec requires 2-byte alignment of the buffer. */ char tmp[3 * sizeof(num)] __aligned(2); char sign; char locase; int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); int i; bool is_zero = num == 0LL; int field_width = spec.field_width; int precision = spec.precision; /* locase = 0 or 0x20. ORing digits or letters with 'locase' * produces same digits or (maybe lowercased) letters */ locase = (spec.flags & SMALL); if (spec.flags & LEFT) spec.flags &= ~ZEROPAD; sign = 0; if (spec.flags & SIGN) { if ((signed long long)num < 0) { sign = '-'; num = -(signed long long)num; field_width--; } else if (spec.flags & PLUS) { sign = '+'; field_width--; } else if (spec.flags & SPACE) { sign = ' '; field_width--; } } if (need_pfx) { if (spec.base == 16) field_width -= 2; else if (!is_zero) field_width--; } /* generate full string in tmp[], in reverse order */ i = 0; if (num < spec.base) tmp[i++] = hex_asc_upper[num] | locase; else if (spec.base != 10) { /* 8 or 16 */ int mask = spec.base - 1; int shift = 3; if (spec.base == 16) shift = 4; do { tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase); num >>= shift; } while (num); } else { /* base 10 */ i = put_dec(tmp, num) - tmp; } /* printing 100 using %2d gives "100", not "00" */ if (i > precision) precision = i; /* leading space padding */ field_width -= precision; if (!(spec.flags & (ZEROPAD | LEFT))) { while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } } /* sign */ if (sign) { if (buf < end) *buf = sign; ++buf; } /* "0x" / "0" prefix */ if (need_pfx) { if (spec.base == 16 || !is_zero) { if (buf < end) *buf = '0'; ++buf; } if (spec.base == 16) { if (buf < end) *buf = ('X' | locase); ++buf; } } /* zero or space padding */ if (!(spec.flags & LEFT)) { char c = ' ' + (spec.flags & ZEROPAD); while (--field_width >= 0) { if (buf < end) *buf = c; ++buf; } } /* hmm even more zero padding? */ while (i <= --precision) { if (buf < end) *buf = '0'; ++buf; } /* actual digits of result */ while (--i >= 0) { if (buf < end) *buf = tmp[i]; ++buf; } /* trailing space padding */ while (--field_width >= 0) { if (buf < end) *buf = ' '; ++buf; } return buf; } static noinline_for_stack char *special_hex_number(char *buf, char *end, unsigned long long num, int size) { struct printf_spec spec; spec.field_width = 2 + 2 * size; /* 0x + hex */ spec.flags = SPECIAL | SMALL | ZEROPAD; spec.base = 16; spec.precision = -1; return number(buf, end, num, spec); } static void move_right(char *buf, char *end, unsigned len, unsigned spaces) { size_t size; if (buf >= end) /* nowhere to put anything */ return; size = end - buf; if (size <= spaces) { memset(buf, ' ', size); return; } if (len) { if (len > size - spaces) len = size - spaces; memmove(buf + spaces, buf, len); } memset(buf, ' ', spaces); } /* * Handle field width padding for a string. * @buf: current buffer position * @n: length of string * @end: end of output buffer * @spec: for field width and flags * Returns: new buffer position after padding. */ static noinline_for_stack char *widen_string(char *buf, int n, char *end, struct printf_spec spec) { unsigned spaces; if (likely(n >= spec.field_width)) return buf; /* we want to pad the sucker */ spaces = spec.field_width - n; if (!(spec.flags & LEFT)) { move_right(buf - n, end, n, spaces); return buf + spaces; } while (spaces--) { if (buf < end) *buf = ' '; ++buf; } return buf; } /* Handle string from a well known address. */ static char *string_nocheck(char *buf, char *end, const char *s, struct printf_spec spec) { int len = 0; int lim = spec.precision; while (lim--) { char c = *s++; if (!c) break; if (buf < end) *buf = c; ++buf; ++len; } return widen_string(buf, len, end, spec); } static char *err_ptr(char *buf, char *end, void *ptr, struct printf_spec spec) { int err = PTR_ERR(ptr); const char *sym = errname(err); if (sym) return string_nocheck(buf, end, sym, spec); /* * Somebody passed ERR_PTR(-1234) or some other non-existing * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to * printing it as its decimal representation. */ spec.flags |= SIGN; spec.base = 10; return number(buf, end, err, spec); } /* Be careful: error messages must fit into the given buffer. */ static char *error_string(char *buf, char *end, const char *s, struct printf_spec spec) { /* * Hard limit to avoid a completely insane messages. It actually * works pretty well because most error messages are in * the many pointer format modifiers. */ if (spec.precision == -1) spec.precision = 2 * sizeof(void *); return string_nocheck(buf, end, s, spec); } /* * Do not call any complex external code here. Nested printk()/vsprintf() * might cause infinite loops. Failures might break printk() and would * be hard to debug. */ static const char *check_pointer_msg(const void *ptr) { if (!ptr) return "(null)"; if ((unsigned long)ptr < PAGE_SIZE || IS_ERR_VALUE(ptr)) return "(efault)"; return NULL; } static int check_pointer(char **buf, char *end, const void *ptr, struct printf_spec spec) { const char *err_msg; err_msg = check_pointer_msg(ptr); if (err_msg) { *buf = error_string(*buf, end, err_msg, spec); return -EFAULT; } return 0; } static noinline_for_stack char *string(char *buf, char *end, const char *s, struct printf_spec spec) { if (check_pointer(&buf, end, s, spec)) return buf; return string_nocheck(buf, end, s, spec); } static char *pointer_string(char *buf, char *end, const void *ptr, struct printf_spec spec) { spec.base = 16; spec.flags |= SMALL; if (spec.field_width == -1) { spec.field_width = 2 * sizeof(ptr); spec.flags |= ZEROPAD; } return number(buf, end, (unsigned long int)ptr, spec); } /* Make pointers available for printing early in the boot sequence. */ static int debug_boot_weak_hash __ro_after_init; static int __init debug_boot_weak_hash_enable(char *str) { debug_boot_weak_hash = 1; pr_info("debug_boot_weak_hash enabled\n"); return 0; } early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable); static bool filled_random_ptr_key __read_mostly; static siphash_key_t ptr_key __read_mostly; static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data) { get_random_bytes(&ptr_key, sizeof(ptr_key)); /* Pairs with smp_rmb() before reading ptr_key. */ smp_wmb(); WRITE_ONCE(filled_random_ptr_key, true); return NOTIFY_DONE; } static int __init vsprintf_init_hashval(void) { static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key }; execute_with_initialized_rng(&fill_ptr_key_nb); return 0; } subsys_initcall(vsprintf_init_hashval) /* Maps a pointer to a 32 bit unique identifier. */ static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { unsigned long hashval; if (!READ_ONCE(filled_random_ptr_key)) return -EBUSY; /* Pairs with smp_wmb() after writing ptr_key. */ smp_rmb(); #ifdef CONFIG_64BIT hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); /* * Mask off the first 32 bits, this makes explicit that we have * modified the address (and 32 bits is plenty for a unique ID). */ hashval = hashval & 0xffffffff; #else hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); #endif *hashval_out = hashval; return 0; } int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) { return __ptr_to_hashval(ptr, hashval_out); } static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; int ret; /* * Print the real pointer value for NULL and error pointers, * as they are not actual addresses. */ if (IS_ERR_OR_NULL(ptr)) return pointer_string(buf, end, ptr, spec); /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { hashval = hash_long((unsigned long)ptr, 32); return pointer_string(buf, end, (const void *)hashval, spec); } ret = __ptr_to_hashval(ptr, &hashval); if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } return pointer_string(buf, end, (const void *)hashval, spec); } static char *default_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { /* * default is to _not_ leak addresses, so hash before printing, * unless no_hash_pointers is specified on the command line. */ if (unlikely(no_hash_pointers)) return pointer_string(buf, end, ptr, spec); return ptr_to_id(buf, end, ptr, spec); } int kptr_restrict __read_mostly; static noinline_for_stack char *restricted_pointer(char *buf, char *end, const void *ptr, struct printf_spec spec) { switch (kptr_restrict) { case 0: /* Handle as %p, hash and do _not_ leak addresses. */ return default_pointer(buf, end, ptr, spec); case 1: { const struct cred *cred; /* * kptr_restrict==1 cannot be used in IRQ context * because its test for CAP_SYSLOG would be meaningless. */ if (in_hardirq() || in_serving_softirq() || in_nmi()) { if (spec.field_width == -1) spec.field_width = 2 * sizeof(ptr); return error_string(buf, end, "pK-error", spec); } /* * Only print the real pointer value if the current * process has CAP_SYSLOG and is running with the * same credentials it started with. This is because * access to files is checked at open() time, but %pK * checks permission at read() time. We don't want to * leak pointer values if a binary opens a file using * %pK and then elevates privileges before reading it. */ cred = current_cred(); if (!has_capability_noaudit(current, CAP_SYSLOG) || !uid_eq(cred->euid, cred->uid) || !gid_eq(cred->egid, cred->gid)) ptr = NULL; break; } case 2: default: /* Always print 0's for %pK */ ptr = NULL; break; } return pointer_string(buf, end, ptr, spec); } static noinline_for_stack char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, const char *fmt) { const char *array[4], *s; const struct dentry *p; int depth; int i, n; switch (fmt[1]) { case '2': case '3': case '4': depth = fmt[1] - '0'; break; default: depth = 1; } rcu_read_lock(); for (i = 0; i < depth; i++, d = p) { if (check_pointer(&buf, end, d, spec)) { rcu_read_unlock(); return buf; } p = READ_ONCE(d->d_parent); array[i] = READ_ONCE(d->d_name.name); if (p == d) { if (i) array[i] = ""; i++; break; } } s = array[--i]; for (n = 0; n != spec.precision; n++, buf++) { char c = *s++; if (!c) { if (!i) break; c = '/'; s = array[--i]; } if (buf < end) *buf = c; } rcu_read_unlock(); return widen_string(buf, n, end, spec); } static noinline_for_stack char *file_dentry_name(char *buf, char *end, const struct file *f, struct printf_spec spec, const char *fmt) { if (check_pointer(&buf, end, f, spec)) return buf; return dentry_name(buf, end, f->f_path.dentry, spec, fmt); } #ifdef CONFIG_BLOCK static noinline_for_stack char *bdev_name(char *buf, char *end, struct block_device *bdev, struct printf_spec spec, const char *fmt) { struct gendisk *hd; if (check_pointer(&buf, end, bdev, spec)) return buf; hd = bdev->bd_disk; buf = string(buf, end, hd->disk_name, spec); if (bdev_is_partition(bdev)) { if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) { if (buf < end) *buf = 'p'; buf++; } buf = number(buf, end, bdev_partno(bdev), spec); } return buf; } #endif static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; #endif if (fmt[1] == 'R') ptr = __builtin_extract_return_addr(ptr); value = (unsigned long)ptr; #ifdef CONFIG_KALLSYMS if (*fmt == 'B' && fmt[1] == 'b') sprint_backtrace_build_id(sym, value); else if (*fmt == 'B') sprint_backtrace(sym, value); else if (*fmt == 'S' && (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) sprint_symbol_build_id(sym, value); else if (*fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); return string_nocheck(buf, end, sym, spec); #else return special_hex_number(buf, end, value, sizeof(void *)); #endif } static const struct printf_spec default_str_spec = { .field_width = -1, .precision = -1, }; static const struct printf_spec default_flag_spec = { .base = 16, .precision = -1, .flags = SPECIAL | SMALL, }; static const struct printf_spec default_dec_spec = { .base = 10, .precision = -1, }; static const struct printf_spec default_dec02_spec = { .base = 10, .field_width = 2, .precision = -1, .flags = ZEROPAD, }; static const struct printf_spec default_dec04_spec = { .base = 10, .field_width = 4, .precision = -1, .flags = ZEROPAD, }; static noinline_for_stack char *hex_range(char *buf, char *end, u64 start_val, u64 end_val, struct printf_spec spec) { buf = number(buf, end, start_val, spec); if (start_val == end_val) return buf; if (buf < end) *buf = '-'; ++buf; return number(buf, end, end_val, spec); } static noinline_for_stack char *resource_string(char *buf, char *end, struct resource *res, struct printf_spec spec, const char *fmt) { #ifndef IO_RSRC_PRINTK_SIZE #define IO_RSRC_PRINTK_SIZE 6 #endif #ifndef MEM_RSRC_PRINTK_SIZE #define MEM_RSRC_PRINTK_SIZE 10 #endif static const struct printf_spec io_spec = { .base = 16, .field_width = IO_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec mem_spec = { .base = 16, .field_width = MEM_RSRC_PRINTK_SIZE, .precision = -1, .flags = SPECIAL | SMALL | ZEROPAD, }; static const struct printf_spec bus_spec = { .base = 16, .field_width = 2, .precision = -1, .flags = SMALL | ZEROPAD, }; static const struct printf_spec str_spec = { .field_width = -1, .precision = 10, .flags = LEFT, }; /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8) * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */ #define RSRC_BUF_SIZE ((2 * sizeof(resource_size_t)) + 4) #define FLAG_BUF_SIZE (2 * sizeof(res->flags)) #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]") #define RAW_BUF_SIZE sizeof("[mem - flags 0x]") char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE, 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)]; char *p = sym, *pend = sym + sizeof(sym); int decode = (fmt[0] == 'R') ? 1 : 0; const struct printf_spec *specp; if (check_pointer(&buf, end, res, spec)) return buf; *p++ = '['; if (res->flags & IORESOURCE_IO) { p = string_nocheck(p, pend, "io ", str_spec); specp = &io_spec; } else if (res->flags & IORESOURCE_MEM) { p = string_nocheck(p, pend, "mem ", str_spec); specp = &mem_spec; } else if (res->flags & IORESOURCE_IRQ) { p = string_nocheck(p, pend, "irq ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_DMA) { p = string_nocheck(p, pend, "dma ", str_spec); specp = &default_dec_spec; } else if (res->flags & IORESOURCE_BUS) { p = string_nocheck(p, pend, "bus ", str_spec); specp = &bus_spec; } else { p = string_nocheck(p, pend, "??? ", str_spec); specp = &mem_spec; decode = 0; } if (decode && res->flags & IORESOURCE_UNSET) { p = string_nocheck(p, pend, "size ", str_spec); p = number(p, pend, resource_size(res), *specp); } else { p = hex_range(p, pend, res->start, res->end, *specp); } if (decode) { if (res->flags & IORESOURCE_MEM_64) p = string_nocheck(p, pend, " 64bit", str_spec); if (res->flags & IORESOURCE_PREFETCH) p = string_nocheck(p, pend, " pref", str_spec); if (res->flags & IORESOURCE_WINDOW) p = string_nocheck(p, pend, " window", str_spec); if (res->flags & IORESOURCE_DISABLED) p = string_nocheck(p, pend, " disabled", str_spec); } else { p = string_nocheck(p, pend, " flags ", str_spec); p = number(p, pend, res->flags, default_flag_spec); } *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *range_string(char *buf, char *end, const struct range *range, struct printf_spec spec, const char *fmt) { char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")]; char *p = sym, *pend = sym + sizeof(sym); struct printf_spec range_spec = { .field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */ .flags = SPECIAL | SMALL | ZEROPAD, .base = 16, .precision = -1, }; if (check_pointer(&buf, end, range, spec)) return buf; p = string_nocheck(p, pend, "[range ", default_str_spec); p = hex_range(p, pend, range->start, range->end, range_spec); *p++ = ']'; *p = '\0'; return string_nocheck(buf, end, sym, spec); } static noinline_for_stack char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { int i, len = 1; /* if we pass '%ph[CDN]', field width remains negative value, fallback to the default */ char separator; if (spec.field_width == 0) /* nothing to print */ return buf; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'C': separator = ':'; break; case 'D': separator = '-'; break; case 'N': separator = 0; break; default: separator = ' '; break; } if (spec.field_width > 0) len = min_t(int, spec.field_width, 64); for (i = 0; i < len; ++i) { if (buf < end) *buf = hex_asc_hi(addr[i]); ++buf; if (buf < end) *buf = hex_asc_lo(addr[i]); ++buf; if (separator && i != len - 1) { if (buf < end) *buf = separator; ++buf; } } return buf; } static noinline_for_stack char *bitmap_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { const int CHUNKSZ = 32; int nr_bits = max_t(int, spec.field_width, 0); int i, chunksz; bool first = true; if (check_pointer(&buf, end, bitmap, spec)) return buf; /* reused to print numbers */ spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 }; chunksz = nr_bits & (CHUNKSZ - 1); if (chunksz == 0) chunksz = CHUNKSZ; i = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; for (; i >= 0; i -= CHUNKSZ) { u32 chunkmask, val; int word, bit; chunkmask = ((1ULL << chunksz) - 1); word = i / BITS_PER_LONG; bit = i % BITS_PER_LONG; val = (bitmap[word] >> bit) & chunkmask; if (!first) { if (buf < end) *buf = ','; buf++; } first = false; spec.field_width = DIV_ROUND_UP(chunksz, 4); buf = number(buf, end, val, spec); chunksz = CHUNKSZ; } return buf; } static noinline_for_stack char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); bool first = true; int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; buf++; } first = false; buf = number(buf, end, rbot, default_dec_spec); if (rtop == rbot + 1) continue; if (buf < end) *buf = '-'; buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } static noinline_for_stack char *mac_address_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")]; char *p = mac_addr; int i; char separator; bool reversed = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': separator = '-'; break; case 'R': reversed = true; fallthrough; default: separator = ':'; break; } for (i = 0; i < 6; i++) { if (reversed) p = hex_byte_pack(p, addr[5 - i]); else p = hex_byte_pack(p, addr[i]); if (fmt[0] == 'M' && i != 5) *p++ = separator; } *p = '\0'; return string_nocheck(buf, end, mac_addr, spec); } static noinline_for_stack char *ip4_string(char *p, const u8 *addr, const char *fmt) { int i; bool leading_zeros = (fmt[0] == 'i'); int index; int step; switch (fmt[2]) { case 'h': #ifdef __BIG_ENDIAN index = 0; step = 1; #else index = 3; step = -1; #endif break; case 'l': index = 3; step = -1; break; case 'n': case 'b': default: index = 0; step = 1; break; } for (i = 0; i < 4; i++) { char temp[4] __aligned(2); /* hold each IP quad in reverse order */ int digits = put_dec_trunc8(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0'; if (digits < 2) *p++ = '0'; } /* reverse the digits in the quad */ while (digits--) *p++ = temp[digits]; if (i < 3) *p++ = '.'; index += step; } *p = '\0'; return p; } static noinline_for_stack char *ip6_compressed_string(char *p, const char *addr) { int i, j, range; unsigned char zerolength[8]; int longest = 1; int colonpos = -1; u16 word; u8 hi, lo; bool needcolon = false; bool useIPv4; struct in6_addr in6; memcpy(&in6, addr, sizeof(struct in6_addr)); useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6); memset(zerolength, 0, sizeof(zerolength)); if (useIPv4) range = 6; else range = 8; /* find position of longest 0 run */ for (i = 0; i < range; i++) { for (j = i; j < range; j++) { if (in6.s6_addr16[j] != 0) break; zerolength[i]++; } } for (i = 0; i < range; i++) { if (zerolength[i] > longest) { longest = zerolength[i]; colonpos = i; } } if (longest == 1) /* don't compress a single 0 */ colonpos = -1; /* emit address */ for (i = 0; i < range; i++) { if (i == colonpos) { if (needcolon || i == 0) *p++ = ':'; *p++ = ':'; needcolon = false; i += longest - 1; continue; } if (needcolon) { *p++ = ':'; needcolon = false; } /* hex u16 without leading 0s */ word = ntohs(in6.s6_addr16[i]); hi = word >> 8; lo = word & 0xff; if (hi) { if (hi > 0x0f) p = hex_byte_pack(p, hi); else *p++ = hex_asc_lo(hi); p = hex_byte_pack(p, lo); } else if (lo > 0x0f) p = hex_byte_pack(p, lo); else *p++ = hex_asc_lo(lo); needcolon = true; } if (useIPv4) { if (needcolon) *p++ = ':'; p = ip4_string(p, &in6.s6_addr[12], "I4"); } *p = '\0'; return p; } static noinline_for_stack char *ip6_string(char *p, const char *addr, const char *fmt) { int i; for (i = 0; i < 8; i++) { p = hex_byte_pack(p, *addr++); p = hex_byte_pack(p, *addr++); if (fmt[0] == 'I' && i != 7) *p++ = ':'; } *p = '\0'; return p; } static noinline_for_stack char *ip6_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")]; if (fmt[0] == 'I' && fmt[2] == 'c') ip6_compressed_string(ip6_addr, addr); else ip6_string(ip6_addr, addr, fmt); return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char ip4_addr[sizeof("255.255.255.255")]; ip4_string(ip4_addr, addr, fmt); return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa, struct printf_spec spec, const char *fmt) { bool have_p = false, have_s = false, have_f = false, have_c = false; char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") + sizeof(":12345") + sizeof("/123456789") + sizeof("%1234567890")]; char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr); const u8 *addr = (const u8 *) &sa->sin6_addr; char fmt6[2] = { fmt[0], '6' }; u8 off = 0; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'f': have_f = true; break; case 's': have_s = true; break; case 'c': have_c = true; break; } } if (have_p || have_s || have_f) { *p = '['; off = 1; } if (fmt6[0] == 'I' && have_c) p = ip6_compressed_string(ip6_addr + off, addr); else p = ip6_string(ip6_addr + off, addr, fmt6); if (have_p || have_s || have_f) *p++ = ']'; if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin6_port), spec); } if (have_f) { *p++ = '/'; p = number(p, pend, ntohl(sa->sin6_flowinfo & IPV6_FLOWINFO_MASK), spec); } if (have_s) { *p++ = '%'; p = number(p, pend, sa->sin6_scope_id, spec); } *p = '\0'; return string_nocheck(buf, end, ip6_addr, spec); } static noinline_for_stack char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa, struct printf_spec spec, const char *fmt) { bool have_p = false; char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")]; char *pend = ip4_addr + sizeof(ip4_addr); const u8 *addr = (const u8 *) &sa->sin_addr.s_addr; char fmt4[3] = { fmt[0], '4', 0 }; fmt++; while (isalpha(*++fmt)) { switch (*fmt) { case 'p': have_p = true; break; case 'h': case 'l': case 'n': case 'b': fmt4[2] = *fmt; break; } } p = ip4_string(ip4_addr, addr, fmt4); if (have_p) { *p++ = ':'; p = number(p, pend, ntohs(sa->sin_port), spec); } *p = '\0'; return string_nocheck(buf, end, ip4_addr, spec); } static noinline_for_stack char *ip_addr_string(char *buf, char *end, const void *ptr, struct printf_spec spec, const char *fmt) { char *err_fmt_msg; if (check_pointer(&buf, end, ptr, spec)) return buf; switch (fmt[1]) { case '6': return ip6_addr_string(buf, end, ptr, spec, fmt); case '4': return ip4_addr_string(buf, end, ptr, spec, fmt); case 'S': { const union { struct sockaddr raw; struct sockaddr_in v4; struct sockaddr_in6 v6; } *sa = ptr; switch (sa->raw.sa_family) { case AF_INET: return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt); case AF_INET6: return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt); default: return error_string(buf, end, "(einval)", spec); }} } err_fmt_msg = fmt[0] == 'i' ? "(%pi?)" : "(%pI?)"; return error_string(buf, end, err_fmt_msg, spec); } static noinline_for_stack char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, const char *fmt) { bool found = true; int count = 1; unsigned int flags = 0; int len; if (spec.field_width == 0) return buf; /* nothing to print */ if (check_pointer(&buf, end, addr, spec)) return buf; do { switch (fmt[count++]) { case 'a': flags |= ESCAPE_ANY; break; case 'c': flags |= ESCAPE_SPECIAL; break; case 'h': flags |= ESCAPE_HEX; break; case 'n': flags |= ESCAPE_NULL; break; case 'o': flags |= ESCAPE_OCTAL; break; case 'p': flags |= ESCAPE_NP; break; case 's': flags |= ESCAPE_SPACE; break; default: found = false; break; } } while (found); if (!flags) flags = ESCAPE_ANY_NP; len = spec.field_width < 0 ? 1 : spec.field_width; /* * string_escape_mem() writes as many characters as it can to * the given buffer, and returns the total size of the output * had the buffer been big enough. */ buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wsuggest-attribute=format" #endif static char *va_format(char *buf, char *end, struct va_format *va_fmt, struct printf_spec spec) { va_list va; if (check_pointer(&buf, end, va_fmt, spec)) return buf; va_copy(va, *va_fmt->va); buf += vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, va); va_end(va); return buf; } #pragma GCC diagnostic pop static noinline_for_stack char *uuid_string(char *buf, char *end, const u8 *addr, struct printf_spec spec, const char *fmt) { char uuid[UUID_STRING_LEN + 1]; char *p = uuid; int i; const u8 *index = uuid_index; bool uc = false; if (check_pointer(&buf, end, addr, spec)) return buf; switch (*(++fmt)) { case 'L': uc = true; fallthrough; case 'l': index = guid_index; break; case 'B': uc = true; break; } for (i = 0; i < 16; i++) { if (uc) p = hex_byte_pack_upper(p, addr[index[i]]); else p = hex_byte_pack(p, addr[index[i]]); switch (i) { case 3: case 5: case 7: case 9: *p++ = '-'; break; } } *p = 0; return string_nocheck(buf, end, uuid, spec); } static noinline_for_stack char *netdev_bits(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'F': num = *(const netdev_features_t *)addr; size = sizeof(netdev_features_t); break; default: return error_string(buf, end, "(%pN?)", spec); } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *fourcc_string(char *buf, char *end, const u32 *fourcc, struct printf_spec spec, const char *fmt) { char output[sizeof("0123 little-endian (0x01234567)")]; char *p = output; unsigned int i; bool pixel_fmt = false; u32 orig, val; if (fmt[1] != 'c') return error_string(buf, end, "(%p4?)", spec); if (check_pointer(&buf, end, fourcc, spec)) return buf; orig = get_unaligned(fourcc); switch (fmt[2]) { case 'h': if (fmt[3] == 'R') orig = swab32(orig); break; case 'l': orig = (__force u32)cpu_to_le32(orig); break; case 'b': orig = (__force u32)cpu_to_be32(orig); break; case 'c': /* Pixel formats are printed LSB-first */ pixel_fmt = true; break; default: return error_string(buf, end, "(%p4?)", spec); } val = pixel_fmt ? swab32(orig & ~BIT(31)) : orig; for (i = 0; i < sizeof(u32); i++) { unsigned char c = val >> ((3 - i) * 8); /* Print non-control ASCII characters as-is, dot otherwise */ *p++ = isascii(c) && isprint(c) ? c : '.'; } if (pixel_fmt) { *p++ = ' '; strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian"); p += strlen(p); } *p++ = ' '; *p++ = '('; p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32)); *p++ = ')'; *p = '\0'; return string(buf, end, output, spec); } static noinline_for_stack char *address_val(char *buf, char *end, const void *addr, struct printf_spec spec, const char *fmt) { unsigned long long num; int size; if (check_pointer(&buf, end, addr, spec)) return buf; switch (fmt[1]) { case 'd': num = *(const dma_addr_t *)addr; size = sizeof(dma_addr_t); break; case 'p': default: num = *(const phys_addr_t *)addr; size = sizeof(phys_addr_t); break; } return special_hex_number(buf, end, num, size); } static noinline_for_stack char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r) { int year = tm->tm_year + (r ? 0 : 1900); int mon = tm->tm_mon + (r ? 0 : 1); buf = number(buf, end, year, default_dec04_spec); if (buf < end) *buf = '-'; buf++; buf = number(buf, end, mon, default_dec02_spec); if (buf < end) *buf = '-'; buf++; return number(buf, end, tm->tm_mday, default_dec02_spec); } static noinline_for_stack char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r) { buf = number(buf, end, tm->tm_hour, default_dec02_spec); if (buf < end) *buf = ':'; buf++; buf = number(buf, end, tm->tm_min, default_dec02_spec); if (buf < end) *buf = ':'; buf++; return number(buf, end, tm->tm_sec, default_dec02_spec); } static noinline_for_stack char *rtc_str(char *buf, char *end, const struct rtc_time *tm, struct printf_spec spec, const char *fmt) { bool have_t = true, have_d = true; bool raw = false, iso8601_separator = true; bool found = true; int count = 2; if (check_pointer(&buf, end, tm, spec)) return buf; switch (fmt[count]) { case 'd': have_t = false; count++; break; case 't': have_d = false; count++; break; } do { switch (fmt[count++]) { case 'r': raw = true; break; case 's': iso8601_separator = false; break; default: found = false; break; } } while (found); if (have_d) buf = date_str(buf, end, tm, raw); if (have_d && have_t) { if (buf < end) *buf = iso8601_separator ? 'T' : ' '; buf++; } if (have_t) buf = time_str(buf, end, tm, raw); return buf; } static noinline_for_stack char *time64_str(char *buf, char *end, const time64_t time, struct printf_spec spec, const char *fmt) { struct rtc_time rtc_time; struct tm tm; time64_to_tm(time, 0, &tm); rtc_time.tm_sec = tm.tm_sec; rtc_time.tm_min = tm.tm_min; rtc_time.tm_hour = tm.tm_hour; rtc_time.tm_mday = tm.tm_mday; rtc_time.tm_mon = tm.tm_mon; rtc_time.tm_year = tm.tm_year; rtc_time.tm_wday = tm.tm_wday; rtc_time.tm_yday = tm.tm_yday; rtc_time.tm_isdst = 0; return rtc_str(buf, end, &rtc_time, spec, fmt); } static noinline_for_stack char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec, const char *fmt) { switch (fmt[1]) { case 'R': return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt); case 'T': return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt); default: return error_string(buf, end, "(%pt?)", spec); } } static noinline_for_stack char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, const char *fmt) { if (!IS_ENABLED(CONFIG_HAVE_CLK)) return error_string(buf, end, "(%pC?)", spec); if (check_pointer(&buf, end, clk, spec)) return buf; #ifdef CONFIG_COMMON_CLK return string(buf, end, __clk_get_name(clk), spec); #else return ptr_to_id(buf, end, clk, spec); #endif } static char *format_flags(char *buf, char *end, unsigned long flags, const struct trace_print_flags *names) { unsigned long mask; for ( ; flags && names->name; names++) { mask = names->mask; if ((flags & mask) != mask) continue; buf = string(buf, end, names->name, default_str_spec); flags &= ~mask; if (flags) { if (buf < end) *buf = '|'; buf++; } } if (flags) buf = number(buf, end, flags, default_flag_spec); return buf; } struct page_flags_fields { int width; int shift; int mask; const struct printf_spec *spec; const char *name; }; static const struct page_flags_fields pff[] = { {SECTIONS_WIDTH, SECTIONS_PGSHIFT, SECTIONS_MASK, &default_dec_spec, "section"}, {NODES_WIDTH, NODES_PGSHIFT, NODES_MASK, &default_dec_spec, "node"}, {ZONES_WIDTH, ZONES_PGSHIFT, ZONES_MASK, &default_dec_spec, "zone"}, {LAST_CPUPID_WIDTH, LAST_CPUPID_PGSHIFT, LAST_CPUPID_MASK, &default_flag_spec, "lastcpupid"}, {KASAN_TAG_WIDTH, KASAN_TAG_PGSHIFT, KASAN_TAG_MASK, &default_flag_spec, "kasantag"}, }; static char *format_page_flags(char *buf, char *end, unsigned long flags) { unsigned long main_flags = flags & PAGEFLAGS_MASK; bool append = false; int i; buf = number(buf, end, flags, default_flag_spec); if (buf < end) *buf = '('; buf++; /* Page flags from the main area. */ if (main_flags) { buf = format_flags(buf, end, main_flags, pageflag_names); append = true; } /* Page flags from the fields area */ for (i = 0; i < ARRAY_SIZE(pff); i++) { /* Skip undefined fields. */ if (!pff[i].width) continue; /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */ if (append) { if (buf < end) *buf = '|'; buf++; } buf = string(buf, end, pff[i].name, default_str_spec); if (buf < end) *buf = '='; buf++; buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask, *pff[i].spec); append = true; } if (buf < end) *buf = ')'; buf++; return buf; } static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) { unsigned long flags; const struct trace_print_flags *names; if (check_pointer(&buf, end, flags_ptr, spec)) return buf; switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; break; case 'g': flags = (__force unsigned long)(*(gfp_t *)flags_ptr); names = gfpflag_names; break; default: return error_string(buf, end, "(%pG?)", spec); } return format_flags(buf, end, flags, names); } static noinline_for_stack char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf, char *end) { int depth; /* Loop starting from the root node to the current node. */ for (depth = fwnode_count_parents(fwnode); depth >= 0; depth--) { /* * Only get a reference for other nodes (i.e. parent nodes). * fwnode refcount may be 0 here. */ struct fwnode_handle *__fwnode = depth ? fwnode_get_nth_parent(fwnode, depth) : fwnode; buf = string(buf, end, fwnode_get_name_prefix(__fwnode), default_str_spec); buf = string(buf, end, fwnode_get_name(__fwnode), default_str_spec); if (depth) fwnode_handle_put(__fwnode); } return buf; } static noinline_for_stack char *device_node_string(char *buf, char *end, struct device_node *dn, struct printf_spec spec, const char *fmt) { char tbuf[sizeof("xxxx") + 1]; const char *p; int ret; char *buf_start = buf; struct property *prop; bool has_mult, pass; struct printf_spec str_spec = spec; str_spec.field_width = -1; if (fmt[0] != 'F') return error_string(buf, end, "(%pO?)", spec); if (!IS_ENABLED(CONFIG_OF)) return error_string(buf, end, "(%pOF?)", spec); if (check_pointer(&buf, end, dn, spec)) return buf; /* simple case without anything any more format specifiers */ fmt++; if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0) fmt = "f"; for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) { int precision; if (pass) { if (buf < end) *buf = ':'; buf++; } switch (*fmt) { case 'f': /* full_name */ buf = fwnode_full_name_string(of_fwnode_handle(dn), buf, end); break; case 'n': /* name */ p = fwnode_get_name(of_fwnode_handle(dn)); precision = str_spec.precision; str_spec.precision = strchrnul(p, '@') - p; buf = string(buf, end, p, str_spec); str_spec.precision = precision; break; case 'p': /* phandle */ buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec); break; case 'P': /* path-spec */ p = fwnode_get_name(of_fwnode_handle(dn)); if (!p[1]) p = "/"; buf = string(buf, end, p, str_spec); break; case 'F': /* flags */ tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-'; tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-'; tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-'; tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-'; tbuf[4] = 0; buf = string_nocheck(buf, end, tbuf, str_spec); break; case 'c': /* major compatible string */ ret = of_property_read_string(dn, "compatible", &p); if (!ret) buf = string(buf, end, p, str_spec); break; case 'C': /* full compatible string */ has_mult = false; of_property_for_each_string(dn, "compatible", prop, p) { if (has_mult) buf = string_nocheck(buf, end, ",", str_spec); buf = string_nocheck(buf, end, "\"", str_spec); buf = string(buf, end, p, str_spec); buf = string_nocheck(buf, end, "\"", str_spec); has_mult = true; } break; default: break; } } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, struct printf_spec spec, const char *fmt) { struct printf_spec str_spec = spec; char *buf_start = buf; str_spec.field_width = -1; if (*fmt != 'w') return error_string(buf, end, "(%pf?)", spec); if (check_pointer(&buf, end, fwnode, spec)) return buf; fmt++; switch (*fmt) { case 'P': /* name */ buf = string(buf, end, fwnode_get_name(fwnode), str_spec); break; case 'f': /* full_name */ default: buf = fwnode_full_name_string(fwnode, buf, end); break; } return widen_string(buf, buf - buf_start, end, spec); } static noinline_for_stack char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { if (*fmt == 'r' && fmt[1] == 'a') return range_string(buf, end, ptr, spec, fmt); return resource_string(buf, end, ptr, spec, fmt); } int __init no_hash_pointers_enable(char *str) { if (no_hash_pointers) return 0; no_hash_pointers = true; pr_warn("**********************************************************\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("** **\n"); pr_warn("** This system shows unhashed kernel memory addresses **\n"); pr_warn("** via the console, logs, and other interfaces. This **\n"); pr_warn("** might reduce the security of your system. **\n"); pr_warn("** **\n"); pr_warn("** If you see this message and you are not debugging **\n"); pr_warn("** the kernel, report this immediately to your system **\n"); pr_warn("** administrator! **\n"); pr_warn("** **\n"); pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); pr_warn("**********************************************************\n"); return 0; } early_param("no_hash_pointers", no_hash_pointers_enable); /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format * specifiers. * * Please update scripts/checkpatch.pl when adding/removing conversion * characters. (Search for "check for vsprintf extension"). * * Right now we handle: * * - 'S' For symbolic direct pointers (or function descriptors) with offset * - 's' For symbolic direct pointers (or function descriptors) without offset * - '[Ss]R' as above with __builtin_extract_return_addr() translation * - 'S[R]b' as above with module build ID (for use in backtraces) * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of * %ps and %pS. Be careful when re-using these specifiers. * - 'B' For backtraced symbolic direct pointers with offset * - 'Bb' as above with module build ID (for use in backtraces) * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff] * - 'b[l]' For a bitmap, the number of bits is determined by the field * width which must be explicitly specified either as part of the * format string '%32b[l]' or through '%*b[l]', [l] selects * range-list format instead of hex format * - 'M' For a 6-byte MAC address, it prints the address in the * usual colon-separated hex notation * - 'm' For a 6-byte MAC address, it prints the hex address without colons * - 'MF' For a 6-byte MAC FDDI address, it prints the address * with a dash-separated hex notation * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth) * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way * IPv4 uses dot-separated decimal without leading 0's (1.2.3.4) * IPv6 uses colon separated network-order 16 bit hex with leading 0's * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - 'i' [46] for 'raw' IPv4/IPv6 addresses * IPv6 omits the colons (01020304...0f) * IPv4 uses dot-separated decimal with leading 0's (010.123.045.006) * [S][pfs] * Generic IPv4/IPv6 address (struct sockaddr *) that falls back to * [4] or [6] and is able to print port [p], flowinfo [f], scope [s] * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order * - 'I[6S]c' for IPv6 addresses printed as specified by * https://tools.ietf.org/html/rfc5952 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination * of the following flags (see string_escape_mem() for the * details): * a - ESCAPE_ANY * c - ESCAPE_SPECIAL * h - ESCAPE_HEX * n - ESCAPE_NULL * o - ESCAPE_OCTAL * p - ESCAPE_NP * s - ESCAPE_SPACE * By default ESCAPE_ANY_NP is used. * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form * "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" * Options for %pU are: * b big endian lower case hex (default) * B big endian UPPER case hex * l little endian lower case hex * L little endian UPPER case hex * big endian output byte order is: * [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15] * little endian output byte order is: * [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15] * - 'V' For a struct va_format which contains a format string * and va_list *, * call vsnprintf(->format, *->va_list). * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. * - 'K' For a kernel pointer that should be hidden from unprivileged users. * Use only for procfs, sysfs and similar files, not printk(); please * read the documentation (path below) first. * - 'NF' For a netdev_features_t * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value. * - '4c[h[R]lb]' For generic FourCC code with raw numerical value. Both are * displayed in the big-endian format. This is the opposite of V4L2 or * DRM FourCCs. * The additional specifiers define what endianness is used to load * the stored bytes. The data might be interpreted using the host, * reversed host byte order, little-endian, or big-endian. * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with * a certain separator (' ' by default): * C colon * D dash * N no separator * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file * - 'g' For block_device name (gendisk + partition number) * - 't[RT][dt][r][s]' For time and date as represented by: * R struct rtc_time * T time64_t * - 'C' For a clock, it prints the name (Common Clock Framework) or address * (legacy clock framework) of the clock * - 'G' For flags to be printed as a collection of symbolic strings that would * construct the specific value. Supported flags given by option: * p page flags (see struct page) given as pointer to unsigned long * g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t * v vma flags (VM_*) given as pointer to unsigned long * - 'OF[fnpPcCF]' For a device tree object * Without any optional arguments prints the full_name * f device node full_name * n device node name * p device node phandle * P device node path spec (name + @unit) * F device node flags * c major compatible string * C full compatible string * - 'fw[fP]' For a firmware node (struct fwnode_handle) pointer * Without an option prints the full name of the node * f full name * P node name, including a possible unit address * - 'x' For printing the address unmodified. Equivalent to "%lx". * Please read the documentation (path below) before using! * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of * bpf_trace_printk() where [ku] prefix specifies either kernel (k) * or user (u) memory to probe, and: * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. * * There is also a '%pA' format specifier, but it is only intended to be used * from Rust code to format core::fmt::Arguments. Do *not* use it from C. * See rust/kernel/print.rs for details. */ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, struct printf_spec spec) { switch (*fmt) { case 'S': case 's': ptr = dereference_symbol_descriptor(ptr); fallthrough; case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_or_range(fmt, buf, end, ptr, spec); case 'h': return hex_string(buf, end, ptr, spec, fmt); case 'b': switch (fmt[1]) { case 'l': return bitmap_list_string(buf, end, ptr, spec, fmt); default: return bitmap_string(buf, end, ptr, spec, fmt); } case 'M': /* Colon separated: 00:01:02:03:04:05 */ case 'm': /* Contiguous: 000102030405 */ /* [mM]F (FDDI) */ /* [mM]R (Reverse order; Bluetooth) */ return mac_address_string(buf, end, ptr, spec, fmt); case 'I': /* Formatted IP supported * 4: 1.2.3.4 * 6: 0001:0203:...:0708 * 6c: 1::708 or 1::1.2.3.4 */ case 'i': /* Contiguous: * 4: 001.002.003.004 * 6: 000102...0f */ return ip_addr_string(buf, end, ptr, spec, fmt); case 'E': return escaped_string(buf, end, ptr, spec, fmt); case 'U': return uuid_string(buf, end, ptr, spec, fmt); case 'V': return va_format(buf, end, ptr, spec); case 'K': return restricted_pointer(buf, end, ptr, spec); case 'N': return netdev_bits(buf, end, ptr, spec, fmt); case '4': return fourcc_string(buf, end, ptr, spec, fmt); case 'a': return address_val(buf, end, ptr, spec, fmt); case 'd': return dentry_name(buf, end, ptr, spec, fmt); case 't': return time_and_date(buf, end, ptr, spec, fmt); case 'C': return clock(buf, end, ptr, spec, fmt); case 'D': return file_dentry_name(buf, end, ptr, spec, fmt); #ifdef CONFIG_BLOCK case 'g': return bdev_name(buf, end, ptr, spec, fmt); #endif case 'G': return flags_string(buf, end, ptr, spec, fmt); case 'O': return device_node_string(buf, end, ptr, spec, fmt + 1); case 'f': return fwnode_string(buf, end, ptr, spec, fmt + 1); case 'A': if (!IS_ENABLED(CONFIG_RUST)) { WARN_ONCE(1, "Please remove %%pA from non-Rust code\n"); return error_string(buf, end, "(%pA?)", spec); } return rust_fmt_argument(buf, end, ptr); case 'x': return pointer_string(buf, end, ptr, spec); case 'e': /* %pe with a non-ERR_PTR gets treated as plain %p */ if (!IS_ERR(ptr)) return default_pointer(buf, end, ptr, spec); return err_ptr(buf, end, ptr, spec); case 'u': case 'k': switch (fmt[1]) { case 's': return string(buf, end, ptr, spec); default: return error_string(buf, end, "(einval)", spec); } default: return default_pointer(buf, end, ptr, spec); } } struct fmt { const char *str; unsigned char state; // enum format_state unsigned char size; // size of numbers }; #define SPEC_CHAR(x, flag) [(x)-32] = flag static unsigned char spec_flag(unsigned char c) { static const unsigned char spec_flag_array[] = { SPEC_CHAR(' ', SPACE), SPEC_CHAR('#', SPECIAL), SPEC_CHAR('+', PLUS), SPEC_CHAR('-', LEFT), SPEC_CHAR('0', ZEROPAD), }; c -= 32; return (c < sizeof(spec_flag_array)) ? spec_flag_array[c] : 0; } /* * Helper function to decode printf style format. * Each call decode a token from the format and return the * number of characters read (or likely the delta where it wants * to go on the next call). * The decoded token is returned through the parameters * * 'h', 'l', or 'L' for integer fields * 'z' support added 23/7/1999 S.H. * 'z' changed to 'Z' --davidm 1/25/99 * 'Z' changed to 'z' --adobriyan 2017-01-25 * 't' added for ptrdiff_t * * @fmt: the format string * @type of the token returned * @flags: various flags such as +, -, # tokens.. * @field_width: overwritten width * @base: base of the number (octal, hex, ...) * @precision: precision of a number * @qualifier: qualifier of a number (long, size_t, ...) */ static noinline_for_stack struct fmt format_decode(struct fmt fmt, struct printf_spec *spec) { const char *start = fmt.str; char flag; /* we finished early by reading the field width */ if (unlikely(fmt.state == FORMAT_STATE_WIDTH)) { if (spec->field_width < 0) { spec->field_width = -spec->field_width; spec->flags |= LEFT; } fmt.state = FORMAT_STATE_NONE; goto precision; } /* we finished early by reading the precision */ if (unlikely(fmt.state == FORMAT_STATE_PRECISION)) { if (spec->precision < 0) spec->precision = 0; fmt.state = FORMAT_STATE_NONE; goto qualifier; } /* By default */ fmt.state = FORMAT_STATE_NONE; for (; *fmt.str ; fmt.str++) { if (*fmt.str == '%') break; } /* Return the current non-format string */ if (fmt.str != start || !*fmt.str) return fmt; /* Process flags. This also skips the first '%' */ spec->flags = 0; do { /* this also skips first '%' */ flag = spec_flag(*++fmt.str); spec->flags |= flag; } while (flag); /* get field width */ spec->field_width = -1; if (isdigit(*fmt.str)) spec->field_width = skip_atoi(&fmt.str); else if (unlikely(*fmt.str == '*')) { /* it's the next argument */ fmt.state = FORMAT_STATE_WIDTH; fmt.str++; return fmt; } precision: /* get the precision */ spec->precision = -1; if (unlikely(*fmt.str == '.')) { fmt.str++; if (isdigit(*fmt.str)) { spec->precision = skip_atoi(&fmt.str); if (spec->precision < 0) spec->precision = 0; } else if (*fmt.str == '*') { /* it's the next argument */ fmt.state = FORMAT_STATE_PRECISION; fmt.str++; return fmt; } } qualifier: /* Set up default numeric format */ spec->base = 10; fmt.state = FORMAT_STATE_NUM; fmt.size = sizeof(int); static const struct format_state { unsigned char state; unsigned char size; unsigned char flags_or_double_size; unsigned char base; } lookup_state[256] = { // Length ['l'] = { 0, sizeof(long), sizeof(long long) }, ['L'] = { 0, sizeof(long long) }, ['h'] = { 0, sizeof(short), sizeof(char) }, ['H'] = { 0, sizeof(char) }, // Questionable historical ['z'] = { 0, sizeof(size_t) }, ['t'] = { 0, sizeof(ptrdiff_t) }, // Non-numeric formats ['c'] = { FORMAT_STATE_CHAR }, ['s'] = { FORMAT_STATE_STR }, ['p'] = { FORMAT_STATE_PTR }, ['%'] = { FORMAT_STATE_PERCENT_CHAR }, // Numerics ['o'] = { FORMAT_STATE_NUM, 0, 0, 8 }, ['x'] = { FORMAT_STATE_NUM, 0, SMALL, 16 }, ['X'] = { FORMAT_STATE_NUM, 0, 0, 16 }, ['d'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['i'] = { FORMAT_STATE_NUM, 0, SIGN, 10 }, ['u'] = { FORMAT_STATE_NUM, 0, 0, 10, }, /* * Since %n poses a greater security risk than * utility, treat it as any other invalid or * unsupported format specifier. */ }; const struct format_state *p = lookup_state + (u8)*fmt.str; if (p->size) { fmt.size = p->size; if (p->flags_or_double_size && fmt.str[0] == fmt.str[1]) { fmt.size = p->flags_or_double_size; fmt.str++; } fmt.str++; p = lookup_state + *fmt.str; } if (p->state) { if (p->base) spec->base = p->base; spec->flags |= p->flags_or_double_size; fmt.state = p->state; fmt.str++; return fmt; } WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt.str); fmt.state = FORMAT_STATE_INVALID; return fmt; } static void set_field_width(struct printf_spec *spec, int width) { spec->field_width = width; if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); } } static void set_precision(struct printf_spec *spec, int prec) { spec->precision = prec; if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { spec->precision = clamp(prec, 0, PRECISION_MAX); } } /* * Turn a 1/2/4-byte value into a 64-bit one for printing: truncate * as necessary and deal with signedness. * * 'size' is the size of the value in bytes. */ static unsigned long long convert_num_spec(unsigned int val, int size, struct printf_spec spec) { unsigned int shift = 32 - size*8; val <<= shift; if (!(spec.flags & SIGN)) return val >> shift; return (int)val >> shift; } /** * vsnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt_str: The format string to use * @args: Arguments for the format string * * This function generally follows C99 vsnprintf, but has some * extensions and a few limitations: * * - ``%n`` is unsupported * - ``%p*`` is handled by pointer() * * See pointer() or Documentation/core-api/printk-formats.rst for more * extensive description. * * **Please update the documentation in both places when making changes** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. * * If you're not already dealing with a va_list consider using snprintf(). */ int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args) { char *str, *end; struct printf_spec spec = {0}; struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt.str) { const char *old_fmt = fmt.str; fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: { int read = fmt.str - old_fmt; if (str < end) { int copy = read; if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; continue; } case FORMAT_STATE_NUM: { unsigned long long num; if (fmt.size <= sizeof(int)) num = convert_num_spec(va_arg(args, int), fmt.size, spec); else num = va_arg(args, long long); str = number(str, end, num, spec); continue; } case FORMAT_STATE_WIDTH: set_field_width(&spec, va_arg(args, int)); continue; case FORMAT_STATE_PRECISION: set_precision(&spec, va_arg(args, int)); continue; case FORMAT_STATE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) va_arg(args, int); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } continue; } case FORMAT_STATE_STR: str = string(str, end, va_arg(args, char *), spec); continue; case FORMAT_STATE_PTR: str = pointer(fmt.str, str, end, va_arg(args, void *), spec); while (isalnum(*fmt.str)) fmt.str++; continue; case FORMAT_STATE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; continue; default: /* * Presumably the arguments passed gcc's type * checking, but there is no safe or sane way * for us to continue parsing the format and * fetching from the va_list; the remaining * specifiers and arguments would be out of * sync. */ goto out; } } out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } /* the trailing null byte doesn't count towards the total */ return str-buf; } EXPORT_SYMBOL(vsnprintf); /** * vscnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @args: Arguments for the format string * * The return value is the number of characters which have been written into * the @buf not including the trailing '\0'. If @size is == 0 the function * returns 0. * * If you're not already dealing with a va_list consider using scnprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vscnprintf(char *buf, size_t size, const char *fmt, va_list args) { int i; if (unlikely(!size)) return 0; i = vsnprintf(buf, size, fmt, args); if (likely(i < size)) return i; return size - 1; } EXPORT_SYMBOL(vscnprintf); /** * snprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters which would be * generated for the given input, excluding the trailing null, * as per ISO C99. If the return is greater than or equal to * @size, the resulting string is truncated. * * See the vsnprintf() documentation for format string extensions over C99. */ int snprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(snprintf); /** * scnprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt: The format string to use * @...: Arguments for the format string * * The return value is the number of characters written into @buf not including * the trailing '\0'. If @size is == 0 the function returns 0. */ int scnprintf(char *buf, size_t size, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vscnprintf(buf, size, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(scnprintf); /** * vsprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @args: Arguments for the format string * * The function returns the number of characters written * into @buf. Use vsnprintf() or vscnprintf() in order to avoid * buffer overflows. * * If you're not already dealing with a va_list consider using sprintf(). * * See the vsnprintf() documentation for format string extensions over C99. */ int vsprintf(char *buf, const char *fmt, va_list args) { return vsnprintf(buf, INT_MAX, fmt, args); } EXPORT_SYMBOL(vsprintf); /** * sprintf - Format a string and place it in a buffer * @buf: The buffer to place the result into * @fmt: The format string to use * @...: Arguments for the format string * * The function returns the number of characters written * into @buf. Use snprintf() or scnprintf() in order to avoid * buffer overflows. * * See the vsnprintf() documentation for format string extensions over C99. */ int sprintf(char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsnprintf(buf, INT_MAX, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sprintf); #ifdef CONFIG_BINARY_PRINTF /* * bprintf service: * vbin_printf() - VA arguments to binary data * bstr_printf() - Binary data to text string */ /** * vbin_printf - Parse a format string and place args' binary value in a buffer * @bin_buf: The buffer to place args' binary value * @size: The size of the buffer(by words(32bits), not characters) * @fmt_str: The format string to use * @args: Arguments for the format string * * The format follows C99 vsnprintf, except %n is ignored, and its argument * is skipped. * * The return value is the number of words(32bits) which would be generated for * the given input. * * NOTE: * If the return value is greater than @size, the resulting bin_buf is NOT * valid for bstr_printf(). */ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt_str, va_list args) { struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; struct printf_spec spec = {0}; char *str, *end; int width; str = (char *)bin_buf; end = (char *)(bin_buf + size); #define save_arg(type) \ ({ \ unsigned long long value; \ if (sizeof(type) == 8) { \ unsigned long long val8; \ str = PTR_ALIGN(str, sizeof(u32)); \ val8 = va_arg(args, unsigned long long); \ if (str + sizeof(type) <= end) { \ *(u32 *)str = *(u32 *)&val8; \ *(u32 *)(str + 4) = *((u32 *)&val8 + 1); \ } \ value = val8; \ } else { \ unsigned int val4; \ str = PTR_ALIGN(str, sizeof(type)); \ val4 = va_arg(args, int); \ if (str + sizeof(type) <= end) \ *(typeof(type) *)str = (type)(long)val4; \ value = (unsigned long long)val4; \ } \ str += sizeof(type); \ value; \ }) while (*fmt.str) { fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: case FORMAT_STATE_PERCENT_CHAR: break; case FORMAT_STATE_INVALID: goto out; case FORMAT_STATE_WIDTH: case FORMAT_STATE_PRECISION: width = (int)save_arg(int); /* Pointers may require the width */ if (*fmt.str == 'p') set_field_width(&spec, width); break; case FORMAT_STATE_CHAR: save_arg(char); break; case FORMAT_STATE_STR: { const char *save_str = va_arg(args, char *); const char *err_msg; size_t len; err_msg = check_pointer_msg(save_str); if (err_msg) save_str = err_msg; len = strlen(save_str) + 1; if (str + len < end) memcpy(str, save_str, len); str += len; break; } case FORMAT_STATE_PTR: /* Dereferenced pointers must be done now */ switch (*fmt.str) { /* Dereference of functions is still OK */ case 'S': case 's': case 'x': case 'K': case 'e': save_arg(void *); break; default: if (!isalnum(*fmt.str)) { save_arg(void *); break; } str = pointer(fmt.str, str, end, va_arg(args, void *), spec); if (str + 1 < end) *str++ = '\0'; else end[-1] = '\0'; /* Must be nul terminated */ } /* skip all alphanumeric pointer suffixes */ while (isalnum(*fmt.str)) fmt.str++; break; case FORMAT_STATE_NUM: if (fmt.size > sizeof(int)) { save_arg(long long); } else { save_arg(int); } } } out: return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf; #undef save_arg } EXPORT_SYMBOL_GPL(vbin_printf); /** * bstr_printf - Format a string from binary arguments and place it in a buffer * @buf: The buffer to place the result into * @size: The size of the buffer, including the trailing null space * @fmt_str: The format string to use * @bin_buf: Binary arguments for the format string * * This function like C99 vsnprintf, but the difference is that vsnprintf gets * arguments from stack, and bstr_printf gets arguments from @bin_buf which is * a binary buffer that generated by vbin_printf. * * The format follows C99 vsnprintf, but has some extensions: * see vsnprintf comment for details. * * The return value is the number of characters which would * be generated for the given input, excluding the trailing * '\0', as per ISO C99. If you want to have the exact * number of characters written into @buf as return value * (not including the trailing '\0'), use vscnprintf(). If the * return is greater than or equal to @size, the resulting * string is truncated. */ int bstr_printf(char *buf, size_t size, const char *fmt_str, const u32 *bin_buf) { struct fmt fmt = { .str = fmt_str, .state = FORMAT_STATE_NONE, }; struct printf_spec spec = {0}; char *str, *end; const char *args = (const char *)bin_buf; if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; end = buf + size; #define get_arg(type) \ ({ \ typeof(type) value; \ if (sizeof(type) == 8) { \ args = PTR_ALIGN(args, sizeof(u32)); \ *(u32 *)&value = *(u32 *)args; \ *((u32 *)&value + 1) = *(u32 *)(args + 4); \ } else { \ args = PTR_ALIGN(args, sizeof(type)); \ value = *(typeof(type) *)args; \ } \ args += sizeof(type); \ value; \ }) /* Make sure end is always >= buf */ if (end < buf) { end = ((void *)-1); size = end - buf; } while (*fmt.str) { const char *old_fmt = fmt.str; unsigned long long num; fmt = format_decode(fmt, &spec); switch (fmt.state) { case FORMAT_STATE_NONE: { int read = fmt.str - old_fmt; if (str < end) { int copy = read; if (copy > end - str) copy = end - str; memcpy(str, old_fmt, copy); } str += read; continue; } case FORMAT_STATE_WIDTH: set_field_width(&spec, get_arg(int)); continue; case FORMAT_STATE_PRECISION: set_precision(&spec, get_arg(int)); continue; case FORMAT_STATE_CHAR: { char c; if (!(spec.flags & LEFT)) { while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } } c = (unsigned char) get_arg(char); if (str < end) *str = c; ++str; while (--spec.field_width > 0) { if (str < end) *str = ' '; ++str; } continue; } case FORMAT_STATE_STR: { const char *str_arg = args; args += strlen(str_arg) + 1; str = string(str, end, (char *)str_arg, spec); continue; } case FORMAT_STATE_PTR: { bool process = false; int copy, len; /* Non function dereferences were already done */ switch (*fmt.str) { case 'S': case 's': case 'x': case 'K': case 'e': process = true; break; default: if (!isalnum(*fmt.str)) { process = true; break; } /* Pointer dereference was already processed */ if (str < end) { len = copy = strlen(args); if (copy > end - str) copy = end - str; memcpy(str, args, copy); str += len; args += len + 1; } } if (process) str = pointer(fmt.str, str, end, get_arg(void *), spec); while (isalnum(*fmt.str)) fmt.str++; continue; } case FORMAT_STATE_PERCENT_CHAR: if (str < end) *str = '%'; ++str; continue; case FORMAT_STATE_INVALID: goto out; case FORMAT_STATE_NUM: if (fmt.size > sizeof(int)) { num = get_arg(long long); } else { num = convert_num_spec(get_arg(int), fmt.size, spec); } str = number(str, end, num, spec); continue; } } /* while(*fmt.str) */ out: if (size > 0) { if (str < end) *str = '\0'; else end[-1] = '\0'; } #undef get_arg /* the trailing null byte doesn't count towards the total */ return str - buf; } EXPORT_SYMBOL_GPL(bstr_printf); #endif /* CONFIG_BINARY_PRINTF */ /** * vsscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: format of buffer * @args: arguments */ int vsscanf(const char *buf, const char *fmt, va_list args) { const char *str = buf; char *next; char digit; int num = 0; u8 qualifier; unsigned int base; union { long long s; unsigned long long u; } val; s16 field_width; bool is_sign; while (*fmt) { /* skip any white space in format */ /* white space in format matches any amount of * white space, including none, in the input. */ if (isspace(*fmt)) { fmt = skip_spaces(++fmt); str = skip_spaces(str); } /* anything that is not a conversion must match exactly */ if (*fmt != '%' && *fmt) { if (*fmt++ != *str++) break; continue; } if (!*fmt) break; ++fmt; /* skip this conversion. * advance both strings to next white space */ if (*fmt == '*') { if (!*str) break; while (!isspace(*fmt) && *fmt != '%' && *fmt) { /* '%*[' not yet supported, invalid format */ if (*fmt == '[') return num; fmt++; } while (!isspace(*str) && *str) str++; continue; } /* get field width */ field_width = -1; if (isdigit(*fmt)) { field_width = skip_atoi(&fmt); if (field_width <= 0) break; } /* get conversion qualifier */ qualifier = -1; if (*fmt == 'h' || _tolower(*fmt) == 'l' || *fmt == 'z') { qualifier = *fmt++; if (unlikely(qualifier == *fmt)) { if (qualifier == 'h') { qualifier = 'H'; fmt++; } else if (qualifier == 'l') { qualifier = 'L'; fmt++; } } } if (!*fmt) break; if (*fmt == 'n') { /* return number of characters read so far */ *va_arg(args, int *) = str - buf; ++fmt; continue; } if (!*str) break; base = 10; is_sign = false; switch (*fmt++) { case 'c': { char *s = (char *)va_arg(args, char*); if (field_width == -1) field_width = 1; do { *s++ = *str++; } while (--field_width > 0 && *str); num++; } continue; case 's': { char *s = (char *)va_arg(args, char *); if (field_width == -1) field_width = SHRT_MAX; /* first, skip leading white space in buffer */ str = skip_spaces(str); /* now copy until next white space */ while (*str && !isspace(*str) && field_width--) *s++ = *str++; *s = '\0'; num++; } continue; /* * Warning: This implementation of the '[' conversion specifier * deviates from its glibc counterpart in the following ways: * (1) It does NOT support ranges i.e. '-' is NOT a special * character * (2) It cannot match the closing bracket ']' itself * (3) A field width is required * (4) '%*[' (discard matching input) is currently not supported * * Example usage: * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]", * buf1, buf2, buf3); * if (ret < 3) * // etc.. */ case '[': { char *s = (char *)va_arg(args, char *); DECLARE_BITMAP(set, 256) = {0}; unsigned int len = 0; bool negate = (*fmt == '^'); /* field width is required */ if (field_width == -1) return num; if (negate) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) return num; ++fmt; if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ __clear_bit(0, set); } /* match must be non-empty */ if (!test_bit((u8)*str, set)) return num; while (test_bit((u8)*str, set) && field_width--) *s++ = *str++; *s = '\0'; ++num; } continue; case 'o': base = 8; break; case 'x': case 'X': base = 16; break; case 'i': base = 0; fallthrough; case 'd': is_sign = true; fallthrough; case 'u': break; case '%': /* looking for '%' in str */ if (*str++ != '%') return num; continue; default: /* invalid format; stop here */ return num; } /* have some sort of integer conversion. * first, skip white space in buffer. */ str = skip_spaces(str); digit = *str; if (is_sign && digit == '-') { if (field_width == 1) break; digit = *(str + 1); } if (!digit || (base == 16 && !isxdigit(digit)) || (base == 10 && !isdigit(digit)) || (base == 8 && !isodigit(digit)) || (base == 0 && !isdigit(digit))) break; if (is_sign) val.s = simple_strntoll(str, &next, base, field_width >= 0 ? field_width : INT_MAX); else val.u = simple_strntoull(str, &next, base, field_width >= 0 ? field_width : INT_MAX); switch (qualifier) { case 'H': /* that's 'hh' in format */ if (is_sign) *va_arg(args, signed char *) = val.s; else *va_arg(args, unsigned char *) = val.u; break; case 'h': if (is_sign) *va_arg(args, short *) = val.s; else *va_arg(args, unsigned short *) = val.u; break; case 'l': if (is_sign) *va_arg(args, long *) = val.s; else *va_arg(args, unsigned long *) = val.u; break; case 'L': if (is_sign) *va_arg(args, long long *) = val.s; else *va_arg(args, unsigned long long *) = val.u; break; case 'z': *va_arg(args, size_t *) = val.u; break; default: if (is_sign) *va_arg(args, int *) = val.s; else *va_arg(args, unsigned int *) = val.u; break; } num++; if (!next) break; str = next; } return num; } EXPORT_SYMBOL(vsscanf); /** * sscanf - Unformat a buffer into a list of arguments * @buf: input buffer * @fmt: formatting of buffer * @...: resulting arguments */ int sscanf(const char *buf, const char *fmt, ...) { va_list args; int i; va_start(args, fmt); i = vsscanf(buf, fmt, args); va_end(args); return i; } EXPORT_SYMBOL(sscanf);
1046 1048 159 1048 20 20 13 20 20 20 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * Add to a counter while respecting batch size. * * There are 2 implementations, both dealing with the following problem: * * The decision slow path/fast path and the actual update must be atomic. * Otherwise a call in process context could check the current values and * decide that the fast path can be used. If now an interrupt occurs before * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters), * then the this_cpu_add() that is executed after the interrupt has completed * can produce values larger than "batch" or even overflows. */ #ifdef CONFIG_HAVE_CMPXCHG_LOCAL /* * Safety against interrupts is achieved in 2 ways: * 1. the fast path uses local cmpxchg (note: no lock prefix) * 2. the slow path operates with interrupts disabled */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; count = this_cpu_read(*fbc->counters); do { if (unlikely(abs(count + amount) >= batch)) { raw_spin_lock_irqsave(&fbc->lock, flags); /* * Note: by now we might have migrated to another CPU * or the value might have changed. */ count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); return; } } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount)); } #else /* * local_irq_save() is used to make the function irq safe: * - The slow path would be ok as protected by an irq-safe spinlock. * - this_cpu_add would be ok as it is irq-safe by definition. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; unsigned long flags; local_irq_save(flags); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { raw_spin_lock(&fbc->lock); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock(&fbc->lock); } else { this_cpu_add(*fbc->counters, amount); } local_irq_restore(flags); } #endif EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive(). * * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums * from CPUs that are in the process of being taken offline. Dying cpus have * been removed from the online mask, but may not have had the hotplug dead * notifier called to fold the percpu count back into the global counter sum. * By including dying CPUs in the iteration mask, we avoid this race condition * so __percpu_counter_sum() just does the right thing when CPUs are being taken * offline. */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, gfp_t gfp, u32 nr_counters, struct lock_class_key *key) { unsigned long flags __maybe_unused; size_t counter_size; s32 __percpu *counters; u32 i; counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); counters = __alloc_percpu_gfp(nr_counters * counter_size, __alignof__(*counters), gfp); if (!counters) { fbc[0].counters = NULL; return -ENOMEM; } for (i = 0; i < nr_counters; i++) { raw_spin_lock_init(&fbc[i].lock); lockdep_set_class(&fbc[i].lock, key); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init_many); void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; u32 i; if (WARN_ON_ONCE(!fbc)) return; if (!fbc[0].counters) return; for (i = 0; i < nr_counters; i++) debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); for (i = 0; i < nr_counters; i++) list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc[0].counters); for (i = 0; i < nr_counters; i++) fbc[i].counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); /* * Compare counter, and add amount if total is: less than or equal to limit if * amount is positive, or greater than or equal to limit if amount is negative. * Return true if amount is added, or false if total would be beyond the limit. * * Negative limit is allowed, but unusual. * When negative amounts (subs) are given to percpu_counter_limited_add(), * the limit would most naturally be 0 - but other limits are also allowed. * * Overflow beyond S64_MAX is not allowed for: counter, limit and amount * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) { s64 count; s64 unknown; unsigned long flags; bool good = false; if (amount == 0) return true; local_irq_save(flags); unknown = batch * num_online_cpus(); count = __this_cpu_read(*fbc->counters); /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && ((amount > 0 && fbc->count + unknown <= limit) || (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; } raw_spin_lock(&fbc->lock); count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ if (amount > 0) { if (count - unknown > limit) goto out; if (count + unknown <= limit) good = true; } else { if (count + unknown < limit) goto out; if (count - unknown >= limit) good = true; } if (!good) { s32 *pcount; int cpu; for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } if (amount > 0) { if (count > limit) goto out; } else { if (count < limit) goto out; } good = true; } count = __this_cpu_read(*fbc->counters); fbc->count += count + amount; __this_cpu_sub(*fbc->counters, count); out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; } static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup);
4 3 1 1 4 4 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/plist.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; CLASS(hb, hb)(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", current->comm, oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb1)(&key1); CLASS(hb, hb2)(&key2); double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match(&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match(&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); } wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_do_wait() - wait for wakeup, timeout, or signal * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) { /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { bool retry = false; int ret, i; u32 uval; /* * Make sure to have a reference on the private_hash such that we * don't block on rehash after changing the task state below. */ guard(private_hash)(); /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. * * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb, current); continue; } futex_q_unlock(hb); } __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @key2: the second futex_key if used for requeue PI * @task: Task queueing this futex * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <0 - On error and the hb is unlocked. A possible reason: the uaddr can not * be read, does not contain the expected value or is not properly aligned. */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, union futex_key *key2, struct task_struct *task) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: if (1) { CLASS(hb, hb)(&q->key); futex_q_lock(q, hb); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(hb); return -EWOULDBLOCK; } if (key2 && futex_match(&q->key, key2)) { futex_q_unlock(hb); return -EINVAL; } /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ if (task == current) set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb, task); } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, NULL, current); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_do_wait(&q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. */ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP protocol. * * Version: @(#)ip.h 1.0.2 04/28/93 * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IP_H #define _LINUX_IP_H #include <linux/skbuff.h> #include <uapi/linux/ip.h> static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); } static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_inner_network_header(skb); } static inline struct iphdr *ipip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_transport_header(skb); } static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) { u32 len = ntohs(iph->tot_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb); } static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) { return iph_totlen(skb, ip_hdr(skb)); } /* IPv4 datagram length is stored into 16bit field (tot_len) */ #define IP_MAX_MTU 0xFFFFU static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) { iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; } #endif /* _LINUX_IP_H */
20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/xattr.h Extended attributes handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (c) 2001-2002 Silicon Graphics, Inc. All Rights Reserved. Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #ifndef _LINUX_XATTR_H #define _LINUX_XATTR_H #include <linux/slab.h> #include <linux/types.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/user_namespace.h> #include <uapi/linux/xattr.h> /* List of all open_how "versions". */ #define XATTR_ARGS_SIZE_VER0 16 /* sizeof first published struct */ #define XATTR_ARGS_SIZE_LATEST XATTR_ARGS_SIZE_VER0 struct inode; struct dentry; static inline bool is_posix_acl_xattr(const char *name) { return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0); } /* * struct xattr_handler: When @name is set, match attributes with exactly that * name. When @prefix is set instead, match attributes with that prefix and * with a non-empty suffix. */ struct xattr_handler { const char *name; const char *prefix; int flags; /* fs private flags */ bool (*list)(struct dentry *dentry); int (*get)(const struct xattr_handler *, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; /** * xattr_handler_can_list - check whether xattr can be listed * @handler: handler for this type of xattr * @dentry: dentry whose inode xattr to list * * Determine whether the xattr associated with @dentry can be listed given * @handler. * * Return: true if xattr can be listed, false if not. */ static inline bool xattr_handler_can_list(const struct xattr_handler *handler, struct dentry *dentry) { return handler && (!handler->list || handler->list(dentry)); } const char *xattr_full_name(const struct xattr_handler *, const char *); struct xattr { const char *name; void *value; size_t value_len; }; ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t); ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, const char *, struct inode **); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); int xattr_supports_user_prefix(struct inode *inode); static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; } struct simple_xattrs { struct rb_root rb_root; rwlock_t lock; }; struct simple_xattr { struct rb_node rb_node; char *name; size_t size; char value[]; }; void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); #endif /* _LINUX_XATTR_H */
371 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corporation, 2021 * * Author: Mike Rapoport <rppt@linux.ibm.com> */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/swap.h> #include <linux/mount.h> #include <linux/memfd.h> #include <linux/bitops.h> #include <linux/printk.h> #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/pseudo_fs.h> #include <linux/secretmem.h> #include <linux/set_memory.h> #include <linux/sched/signal.h> #include <uapi/linux/magic.h> #include <asm/tlbflush.h> #include "internal.h" #undef pr_fmt #define pr_fmt(fmt) "secretmem: " fmt /* * Define mode and flag masks to allow validation of the system call * parameters. */ #define SECRETMEM_MODE_MASK (0x0) #define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK static bool secretmem_enable __ro_after_init = 1; module_param_named(enable, secretmem_enable, bool, 0400); MODULE_PARM_DESC(secretmem_enable, "Enable secretmem and memfd_secret(2) system call"); static atomic_t secretmem_users; bool secretmem_active(void) { return !!atomic_read(&secretmem_users); } static vm_fault_t secretmem_fault(struct vm_fault *vmf) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct inode *inode = file_inode(vmf->vma->vm_file); pgoff_t offset = vmf->pgoff; gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; struct folio *folio; vm_fault_t ret; int err; if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return vmf_error(-EINVAL); filemap_invalidate_lock_shared(mapping); retry: page = find_lock_page(mapping, offset); if (!page) { folio = folio_alloc(gfp | __GFP_ZERO, 0); if (!folio) { ret = VM_FAULT_OOM; goto out; } page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { folio_put(folio); ret = vmf_error(err); goto out; } __folio_mark_uptodate(folio); err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid * which guarantees that this call won't fail */ set_direct_map_default_noflush(page); if (err == -EEXIST) goto retry; ret = vmf_error(err); goto out; } addr = (unsigned long)page_address(page); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); } vmf->page = page; ret = VM_FAULT_LOCKED; out: filemap_invalidate_unlock_shared(mapping); return ret; } static const struct vm_operations_struct secretmem_vm_ops = { .fault = secretmem_fault, }; static int secretmem_release(struct inode *inode, struct file *file) { atomic_dec(&secretmem_users); return 0; } static int secretmem_mmap_prepare(struct vm_area_desc *desc) { const unsigned long len = desc->end - desc->start; if ((desc->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) return -EINVAL; if (!mlock_future_ok(desc->mm, desc->vm_flags | VM_LOCKED, len)) return -EAGAIN; desc->vm_flags |= VM_LOCKED | VM_DONTDUMP; desc->vm_ops = &secretmem_vm_ops; return 0; } bool vma_is_secretmem(struct vm_area_struct *vma) { return vma->vm_ops == &secretmem_vm_ops; } static const struct file_operations secretmem_fops = { .release = secretmem_release, .mmap_prepare = secretmem_mmap_prepare, }; static int secretmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } static void secretmem_free_folio(struct folio *folio) { set_direct_map_default_noflush(&folio->page); folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct address_space *mapping = inode->i_mapping; unsigned int ia_valid = iattr->ia_valid; int ret; filemap_invalidate_lock(mapping); if ((ia_valid & ATTR_SIZE) && inode->i_size) ret = -EINVAL; else ret = simple_setattr(idmap, dentry, iattr); filemap_invalidate_unlock(mapping); return ret; } static const struct inode_operations secretmem_iops = { .setattr = secretmem_setattr, }; static struct vfsmount *secretmem_mnt; static struct file *secretmem_file_create(unsigned long flags) { struct file *file; struct inode *inode; const char *anon_name = "[secretmem]"; int err; inode = alloc_anon_inode(secretmem_mnt->mnt_sb); if (IS_ERR(inode)) return ERR_CAST(inode); err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL); if (err) { file = ERR_PTR(err); goto err_free_inode; } file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", O_RDWR, &secretmem_fops); if (IS_ERR(file)) goto err_free_inode; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_op = &secretmem_iops; inode->i_mapping->a_ops = &secretmem_aops; /* pretend we are a normal file with zero size */ inode->i_mode |= S_IFREG; inode->i_size = 0; return file; err_free_inode: iput(inode); return file; } SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) { struct file *file; int fd, err; /* make sure local flags do not confict with global fcntl.h */ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); if (!secretmem_enable || !can_set_direct_map()) return -ENOSYS; if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) return -EINVAL; if (atomic_read(&secretmem_users) < 0) return -ENFILE; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; file = secretmem_file_create(flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_put_fd; } file->f_flags |= O_LARGEFILE; atomic_inc(&secretmem_users); fd_install(fd, file); return fd; err_put_fd: put_unused_fd(fd); return err; } static int secretmem_init_fs_context(struct fs_context *fc) { return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type secretmem_fs = { .name = "secretmem", .init_fs_context = secretmem_init_fs_context, .kill_sb = kill_anon_super, }; static int __init secretmem_init(void) { if (!secretmem_enable || !can_set_direct_map()) return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) return PTR_ERR(secretmem_mnt); /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; return 0; } fs_initcall(secretmem_init);
287 285 285 285 285 285 285 285 1 1 287 287 1 95 285 248 241 241 287 287 241 241 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <linux/capability.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * The epnested_mutex is acquired when inserting an epoll fd onto another * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL /** * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value * from the epoll instance ep is preferred, but if it is not set fallback to * the system-wide global via busy_loop_timeout. * * @start_time: The start time used to compute the remaining time until timeout. * @ep: Pointer to the eventpoll context. * * Return: true if the timeout has expired, false otherwise. */ static bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep) { unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } else { return busy_loop_timeout(start_time); } } static bool ep_busy_loop_on(struct eventpoll *ep) { return !!READ_ONCE(ep->busy_poll_usecs) || READ_ONCE(ep->prefer_busy_poll) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ if (prefer_busy_poll) napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct eventpoll *ep = file->private_data; void __user *uarg = (void __user *)arg; struct epoll_params epoll_params; switch (cmd) { case EPIOCSPARAMS: if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) return -EFAULT; /* pad byte must be zero */ if (epoll_params.__pad) return -EINVAL; if (epoll_params.busy_poll_usecs > S32_MAX) return -EINVAL; if (epoll_params.prefer_busy_poll > 1) return -EINVAL; if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && !capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); return 0; case EPIOCGPARAMS: memset(&epoll_params, 0, sizeof(epoll_params)); epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } static void ep_suspend_napi_irqs(struct eventpoll *ep) { } static void ep_resume_napi_irqs(struct eventpoll *ep) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { WARN_ON_ONCE(__ep_remove(ep, epi, false)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); } static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; if (!is_file_epoll(file)) return -EINVAL; switch (cmd) { case EPIOCSPARAMS: case EPIOCGPARAMS: ret = ep_eventpoll_bp_ioctl(file, cmd, arg); break; default: ret = -EINVAL; break; } return ret; } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!file_ref_get(&file->f_ref)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". */ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, .unlocked_ioctl = ep_eventpoll_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (!try_cmpxchg(&new->next, &new, head)) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } static int ep_try_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { int res; /* * Try to transfer events to user space. In case we get 0 events and * there's still timeout left over, we go trying again in search of * more luck. */ res = ep_send_events(ep, events, maxevents); if (res > 0) ep_suspend_napi_irqs(ep); return res; } static int ep_schedule_timeout(ktime_t *to) { if (to) return ktime_after(*to, ktime_get()); else return 1; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { res = ep_try_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_clear_and_put(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* Get the "struct file *" for the target file */ CLASS(fd, tf)(fd); if (fd_empty(tf)) return -EBADF; /* The target file descriptor must support poll */ if (!file_can_poll(fd_file(tf))) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(fd_file(tf))) { tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } static int ep_check_params(struct file *file, struct epoll_event __user *evs, int maxevents) { /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ if (!is_file_epoll(file)) return -EINVAL; return 0; } int epoll_sendevents(struct file *file, struct epoll_event __user *events, int maxevents) { struct eventpoll *ep; int ret; ret = ep_check_params(file, events, maxevents); if (unlikely(ret)) return ret; ep = file->private_data; /* * Racy call, but that's ok - it should get retried based on * poll readiness anyway. */ if (ep_events_available(ep)) return ep_try_send_events(ep, events, maxevents); return 0; } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; ret = ep_check_params(fd_file(f), events, maxevents); if (unlikely(ret)) return ret; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* Time to fish for events ... */ return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init);
30 31 31 31 31 31 31 30 30 29 29 1 28 7 6 1 6 287 287 287 287 287 287 287 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic socket support routines. Memory allocators, socket lock/release * handler for protocols to use and generic option handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Fixes: * Alan Cox : Numerous verify_area() problems * Alan Cox : Connecting on a connecting socket * now returns an error for tcp. * Alan Cox : sock->protocol is set correctly. * and is not sometimes left as 0. * Alan Cox : connect handles icmp errors on a * connect properly. Unfortunately there * is a restart syscall nasty there. I * can't match BSD without hacking the C * library. Ideas urgently sought! * Alan Cox : Disallow bind() to addresses that are * not ours - especially broadcast ones!! * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, * instead they leave that for the DESTROY timer. * Alan Cox : Clean up error flag in accept * Alan Cox : TCP ack handling is buggy, the DESTROY timer * was buggy. Put a remove_sock() in the handler * for memory when we hit 0. Also altered the timer * code. The ACK stuff can wait and needs major * TCP layer surgery. * Alan Cox : Fixed TCP ack bug, removed remove sock * and fixed timer/inet_bh race. * Alan Cox : Added zapped flag for TCP * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... * Rick Sladkey : Relaxed UDP rules for matching packets. * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support * Pauline Middelink : identd support * Alan Cox : Fixed connect() taking signals I think. * Alan Cox : SO_LINGER supported * Alan Cox : Error reporting fixes * Anonymous : inet_create tidied up (sk->reuse setting) * Alan Cox : inet sockets don't set sk->type! * Alan Cox : Split socket option code * Alan Cox : Callbacks * Alan Cox : Nagle flag for Charles & Johannes stuff * Alex : Removed restriction on inet fioctl * Alan Cox : Splitting INET from NET core * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code * Alan Cox : Split IP from generic code * Alan Cox : New kfree_skbmem() * Alan Cox : Make SO_DEBUG superuser only. * Alan Cox : Allow anyone to clear SO_DEBUG * (compatibility fix) * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. * Alan Cox : Allocator for a socket is settable. * Alan Cox : SO_ERROR includes soft errors. * Alan Cox : Allow NULL arguments on some SO_ opts * Alan Cox : Generic socket allocation to make hooks * easier (suggested by Craig Metz). * Michael Pall : SO_ERROR returns positive errno again * Steve Whitehouse: Added default destructor to free * protocol private data. * Steve Whitehouse: Added various other default routines * common to several socket families. * Chris Evans : Call suser() check last on F_SETOWN * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() * Andi Kleen : Fix write_space callback * Chris Evans : Security fixes - signedness again * Arnaldo C. Melo : cleanups, use skb_queue_purge * * To Fix: */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/poll.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/init.h> #include <linux/highmem.h> #include <linux/user_namespace.